source("eossa_new.r")
## Loading required package: svd
## Loading required package: forecast
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## WARNING: Rssa was compiled without FFTW support.
## The speed of the routines will be slower as well.
##
## Attaching package: 'Rssa'
## The following object is masked from 'package:stats':
##
## decompose
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stats)
library(readr)
library(stats)
library(Rssa)
library(hpfilter)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(lattice)
library(quadprog)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following object is masked from 'package:Rssa':
##
## decompose
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(urca)
library(tseries)
set.seed(52)
date <- 1:169
ts_cos <- ts(cos(2 * pi * date / 13 + 2), frequency = 13)
plot.ts(ts_cos, type="l")
spec.pgram(ts_cos, taper = 0, log='no', fast = FALSE)
Явно выделенный пик.
date <- 1:150
ts_cos_2 <- ts(cos(2 * pi * date / 13 + 2), frequency = 13)
plot.ts(ts_cos_2, type="l")
spec.pgram(ts_cos_2, taper = 0, log='no', fast = FALSE)
Видим растекание периодограммы.
ts_white <- ts(rnorm(1000))
plot.ts(ts_white, type="l")
spec.pgram(ts_white, taper = 0, log='no', fast = FALSE)
Видно, что значения абсолютно случайны, с близким к экспоненциальному распределению.
generate_red_noise_ar1 <- function(n, phi, sigma = 1, x0 = rnorm(1, 0, 1 / sqrt(1 - phi ** 2))) {
x <- numeric(n)
x[1] <- x0
for (i in 2:n) {
x[i] <- phi * x[i - 1] + rnorm(1, mean = 0, sd = sigma)
}
x
}
n <- 1000
phi <- 0.9
red_noise <- generate_red_noise_ar1(n, phi)
ts_red <- ts(red_noise)
plot.ts(ts_red, type="l")
spec.pgram(ts_red, taper = 0, log='no', fast = FALSE)
Возьмём \(2\) косинуса и красный шум и будем менять длину ряда:
for (i in 1:8) {
last <- i * 100
ts_w_cos <- ts(0.8 * cos(2 * pi * (1:(last)) / 10 + 3) + cos(2 * pi * (1:(last)) / 20 + 3) + 0.5 * generate_red_noise_ar1(last, phi) + rnorm(last))
spec.pgram(ts_w_cos, taper = 0, log='no', fast = FALSE)
}
Видим увеличение пиков сезонности по сравнению с шумовыми компонентами.
Данные по выработке электричеству в США:
electricity_data <- read_csv("electricity_data.csv")
## New names:
## Rows: 255 Columns: 311
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (55): Connecticut : electric utility, Connecticut : all commercial, Ma... dbl
## (255): United States : all sectors, United States : electric utility, U... date
## (1): ...1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
electricity_data <- electricity_data[1:252, ]
ts_electr_1 <- ts(as.vector(electricity_data$`United States : all sectors`), frequency = 12)
plot.ts(ts_electr_1, type="l")
spec.pgram(ts_electr_1, taper = 0, log='no', fast = FALSE, detrend = TRUE)
Видим отчётливые пики, то есть наблюдается периодичность.
Можно рассмотреть другой ряд:
electricity_data$electr <- electricity_data$`United States : independent power producers`
ts_electr_2 <- ts(as.vector(electricity_data$electr), frequency = 12)
plot.ts(ts_electr_2, type="l")
spec_electr <- spec.pgram(ts_electr_2, taper = 0, log='no', fast = FALSE, detrend = TRUE)
Здесь чуть лучше виден тренд.
Возьмём аппроксимацию ряда полиномом 4-ой степени:
lm_electr <- lm(electr ~ poly(...1, 4, raw = TRUE), data = electricity_data)
summary(lm_electr)
##
## Call:
## lm(formula = electr ~ poly(...1, 4, raw = TRUE), data = electricity_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22253 -9052 -3439 6268 39320
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.144e+06 2.484e+06 -3.278 0.00119 **
## poly(...1, 4, raw = TRUE)1 2.083e+03 6.725e+02 3.098 0.00217 **
## poly(...1, 4, raw = TRUE)2 -1.960e-01 6.769e-02 -2.896 0.00412 **
## poly(...1, 4, raw = TRUE)3 8.153e-06 3.003e-06 2.715 0.00710 **
## poly(...1, 4, raw = TRUE)4 -1.262e-10 4.956e-11 -2.547 0.01149 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12960 on 247 degrees of freedom
## Multiple R-squared: 0.5973, Adjusted R-squared: 0.5907
## F-statistic: 91.57 on 4 and 247 DF, p-value: < 2.2e-16
ts_electr_poly <- ts(predict(lm_electr), frequency = 12)
plot(ts_electr_2, type = "l")
lines(ts_electr_poly)
Применим локальную регрессию с разным параметрами сглаживания:
electricity_data$index <- 1:nrow(electricity_data)
loess_electr_10 <- loess(electr ~ index, data = electricity_data, span = 0.1)
loess_electr_30 <- loess(electr ~ index, data = electricity_data, span = 0.3)
loess_electr_50 <- loess(electr ~ index, data = electricity_data, span = 0.5)
ts_electr_loess_10 <- ts(predict(loess_electr_10), frequency = 12)
ts_electr_loess_30 <- ts(predict(loess_electr_30), frequency = 12)
ts_electr_loess_50 <- ts(predict(loess_electr_50), frequency = 12)
plot(ts_electr_2, type = "l")
lines(ts_electr_loess_10, col = "red")
lines(ts_electr_loess_30, col = "blue")
lines(ts_electr_loess_50, col = "green")
legend(x="bottomright", c("time series", "0.1 span", "0.3 span", "0.5 span"), col = c("black", "red", "blue", "green"), lty = 1, lwd = 1)
\(0.3\) — оптимальный параметр для ряда.
Добавим к данным два выброса и учтём веса для этих наблюдений:
electr_data_outliers <- data.frame(index = electricity_data$index, electr = electricity_data$electr)
electr_data_outliers[25, 2] <- 50000
electr_data_outliers[200, 2] <- 50000
loess_electr_out_30 <- loess(electr ~ index, data = electr_data_outliers, family="symmetric", span = 0.3, control = loess.control(surface = "direct", iterations = 1))
loess_electr_out_noweight_30 <- loess(electr ~ index, data = electr_data_outliers, family="symmetric", span = 0.3, control = loess.control(surface = "direct", iterations = 25))
weight_loess <- rep(1, nrow(electr_data_outliers))
weight_loess[25] <- 0.1
weight_loess[200] <- 0.1
loess_electr_out_off_30 <- loess(electr ~ index, data = electr_data_outliers, family="symmetric", weights = weight_loess, span = 0.3)
ts_electr_2_out <- ts(electr_data_outliers$electr, frequency = 12)
ts_electr_loess_out_30 <- ts(predict(loess_electr_out_30), frequency = 12)
ts_electr_loess_out_noweight_30 <- ts(predict(loess_electr_out_noweight_30), frequency = 12)
ts_electr_loess_out_off_30 <- ts(predict(loess_electr_out_off_30), frequency = 12)
plot(ts_electr_2_out, type = "l")
lines(ts_electr_loess_out_noweight_30, col = "purple")
lines(ts_electr_loess_out_30, col = "blue")
lines(ts_electr_loess_out_off_30, col = "green")
legend(x = "bottomright", c("original", "iterative", "no iterative", "weight"), col = c("black", "purple", "blue", "green"), lty = 1, lwd = 1)
plot(loess_electr_out_30$residuals, type = "l", main = "no iterative")
plot(loess_electr_out_noweight_30$residuals, type = "l", main = "iterative")
plot(loess_electr_out_off_30$residuals, type = "l", main = "weight")
Учёт весов (ручной (weight) и автоматический (iterative)) действительно дал близкий к правильному результат.
Применим фильтр Ходрика-Прескота для выделения тренда:
y = data.frame(electr = electricity_data$electr)
ts_electr_hp_1 = ts(hp2(y, lambda = 14400)$electr, frequency = 12)
ts_electr_hp_2 = ts(hp2(y, lambda = 129606)$electr, frequency = 12)
plot(ts_electr_2, type="l")
lines(ts_electr_hp_1, col="red")
lines(ts_electr_hp_2, col="blue")
legend(x="bottomright", c("time series", "lambda = 14400", "lambda = 129606"), col = c("black", "red", "blue"), lty = 1, lwd = 1)
\(\lambda = 14400\) — оптимальный параметр, судя по поведению ряда.
acf(ts_white)
acf(ts_red)
acf(ts_cos)
acf(0.5 * (1:100) - 10)
acf(ts_electr_2)
Тренд + сезонность.
afc <- function(filter, omega) {
k <- seq_along(filter) - 1
h <- function(o) sum(rev(filter) * exp(-k*1i * o))
abs(sapply(omega, h))
}
freq <- seq(0, pi, 0.001)
omega <- freq/2/pi
filt <- rep(1, 10)
plot(afc(filt, freq) ~ omega, type = "l")
Применим к косинусу:
ts_cos_filter <- stats::filter(ts_cos, rep(1, 5) / 5, sides = 2)
ts_cos_filter_shift <- stats::filter(ts_cos, rep(1, 5) / 5, sides = 1)
plot(ts_cos, type = "l", col = "gray", lwd = 1, main = "Фильтрация временного ряда", ylab = "Значение", xlab = "Время")
lines(ts_cos_filter, col = "red", lwd = 2)
lines(ts_cos_filter_shift, col = "blue", lwd = 2)
Видно смещение у одностороннего фильтра и уменьшение амплитуды у обоих.
Фильтр с продлением на концы:
filter_continue <- function(time_series, coef_filter, normalize = TRUE){
n <- length(time_series)
k <- length(coef_filter)
filtered_series <- numeric(n)
for (i in 1:n) {
left_bound <- max(1, i - (k - 1) %/% 2)
right_bound <- min(n, i + k %/% 2)
sub_series <- time_series[left_bound:right_bound]
sub_filter <- coef_filter[(left_bound - i + (k - 1) %/% 2 + 1):(right_bound - i + (k - 1) %/% 2 + 1)]
if (normalize) {
sub_filter <- sub_filter / sum(abs(sub_filter))
}
filtered_series[i] <- sum(sub_series * sub_filter)
}
filtered_series
}
Теперь к периодограмме белого шума:
ts_pgram_white <- ts(spec.pgram(ts_white, taper = 0, log='no', fast = FALSE, plot = FALSE)$spec)
ts_pgram_white_filter <- filter_continue(ts_pgram_white, rep(1, 20))
ts_pgram_white_filter_80 <- filter_continue(ts_pgram_white, rep(1, 80))
plot(ts_pgram_white)
lines(ts_pgram_white_filter, type = "l", col = "blue", lwd = 1)
lines(ts_pgram_white_filter_80, type = "l", col = "red", lwd = 1)
К периодограмме красного шума:
ts_red_7 <- ts(generate_red_noise_ar1(n, 0.7))
ts_pgram_red <- ts(spec.pgram(ts_red_7, taper = 0, log='no', fast = FALSE, plot = FALSE)$spec)
ts_pgram_red_filter <- filter_continue(ts_pgram_red, rep(1, 40))
plot(ts_pgram_red)
lines(ts_pgram_red_filter, type = "l", col = "blue", lwd = 1)
filt <- c(-1,1)
plot(afc(filt, freq) ~ omega, type = "l")
Попробуем применить фильтр к ряду с сезонностью и трендом:
ts_lin_sin <- ts(1/2 * (1:100) + cos(2 * pi * (1:100) / 5))
plot(ts_lin_sin)
ts_lin_sin_filter <- filter_continue(ts_lin_sin, c(-1, 1))
plot(ts_lin_sin_filter, type = "l")
pgram_lin_sin <- spec.pgram(ts_lin_sin, taper = 0, log='no', fast = FALSE, demean = FALSE, detrend = FALSE)
pgram_lin_sin_filter_culc <- pgram_lin_sin
pgram_lin_sin_filter_culc$spec <- pgram_lin_sin$spec * afc(c(-1, 1), seq(pi / 50, pi, pi / 50)) ** 2
pgram_lin_sin_filter <- spec.pgram(ts_lin_sin_filter, taper = 0, log='no', fast = FALSE, demean = FALSE, detrend = FALSE)
plot(pgram_lin_sin_filter_culc$spec, type = "l")
Видим, что фильтр действительно убирает тренд.
ts_electr_2_filter_24_1 <- ts(filter_continue(ts_electr_2, c(1 / 2, rep(1, 23), 1 / 2)), frequency = 12)
ts_electr_2_filter_24_2 <- ts(filter_continue(ts_electr_2_filter_24_1, c(1 / 2, rep(1, 23), 1 / 2)), frequency = 12)
ts_electr_2_filter_34 <- ts(filter_continue(ts_electr_2, c(1 / 2, rep(1, 47), 1 / 2)), frequency = 12)
plot(ts_electr_2, type = "l", col = "gray", lwd = 1, main = "Фильтрация временного ряда", ylab = "Значение", xlab = "Время")
lines(ts_electr_2_filter_24_1, col = "red", lwd = 2)
lines(ts_electr_2_filter_34, col = "blue", lwd = 2)
lines(ts_electr_2_filter_24_2, col = "green", lwd = 2)
legend(x = "bottomright", c("original", "filter 24", "filter 34", "filter 24 (x2)"), col = c("gray", "red", "blue", "green"), lty = 1, lwd = c(1, 2, 2, 2))
spec.pgram(ts_electr_2_filter_24_2, taper = 0, log='no', fast = FALSE, detrend = TRUE, na.action = na.omit)
Тренд явно выделился по сравнению с прошлой периодограммой.
ts_exp <- ts(1.1 ** (1:100))
s_exp <- ssa(ts_exp, kind = "1d-ssa", svd.method = "svd")
plot(ts_exp)
plot(s_exp)
plot(s_exp, type = "vectors", idx = 1:3)
plot(s_exp, type = "paired", idx = 1:3, plot.contrib = FALSE)
plot(wcor(s_exp, groups = 1:10),
scales = list(at = c(10, 20, 30)))
r_exp <- reconstruct(s_exp,
groups = list(Trend = 1))
plot(r_exp, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
Ранг ряда - 1.
ts_poli <- ts(2 * (1:169) ^2 - 3 * (1:169) + 1)
s_poli <- ssa(ts_poli, kind = "1d-ssa", svd.method = "svd")
plot(ts_poli)
plot(s_poli)
plot(s_poli, type = "vectors", idx = 1:5)
plot(s_poli, type = "series", groups = list(1, 1:2, 1:3))
plot(s_poli, type = "paired", idx = 1:3, plot.contrib = FALSE)
plot(wcor(s_poli, groups = 1:10),
scales = list(at = c(10, 20, 30)))
r_poli <- reconstruct(s_poli,
groups = list(Trend = 1:3))
plot(r_poli, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
s_poli$sigma[1:5]
## [1] 1.738257e+06 1.298326e+05 7.595883e+03 2.711291e-10 1.654407e-10
Ранг ряда - степень полинома + 1.
s_cos <- ssa(ts_cos, kind = "1d-ssa", svd.method = "svd")
plot(ts_cos)
plot(s_cos)
plot(s_cos, type = "vectors", idx = 1:3)
plot(s_exp, type = "series", groups = 1:10)
plot(s_cos, type = "paired", idx = 1:3, plot.contrib = FALSE)
plot(wcor(s_cos, groups = 1:10),
scales = list(at = c(10, 20, 30)))
r_cos <- reconstruct(s_cos,
groups = list(Seasonality = 1:2))
plot(r_cos, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
Ранг ряда - 2.
ts_cos_poli <- ts(cos(2 * pi * (1:169) / 13) * (2 * (1:169) ** 2 - 3 * (1:169) + 1))
s_cos_poli <- ssa(ts_cos_poli, kind = "1d-ssa", svd.method = "svd")
plot(ts_cos_poli)
plot(s_cos_poli)
plot(s_cos_poli, type = "vectors", idx = 1:7)
plot(s_cos_poli, type = "paired", idx = 1:6, plot.contrib = FALSE)
plot(wcor(s_cos_poli, groups = 1:10),
scales = list(at = c(10, 20, 30)))
r_cos_poli <- reconstruct(s_cos_poli,
groups = list(Seasonality = 1:6))
plot(r_cos_poli, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
Ранг ряда - 2 (косинус) умножить на 3 (полином) = 6.
ts_cos_cos <- ts(cos(2 * pi * (1:143) / 11) + 4 * cos(2 * pi * (1:143) / 13))
s_cos_cos <- ssa(ts_cos_cos, kind = "1d-ssa", svd.method = "svd")
plot(ts_cos_cos)
plot(s_cos_cos)
plot(s_cos_cos, type = "vectors", idx = 1:5)
plot(s_cos_cos, type = "paired", idx = 1:6, plot.contrib = FALSE)
plot(wcor(s_cos_cos, groups = 1:10),
scales = list(at = c(10, 20, 30)))
r_cos_cos <- reconstruct(s_cos_cos,
groups = list(Seasonality = 1:4))
plot(r_cos_cos, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
Видна точная разделимость. Но если амплитуды равны:
ts_cos_cos_2 <- ts(cos(2 * pi * (1:143) / 11) + cos(2 * pi * (1:143) / 13))
s_cos_cos_2 <- ssa(ts_cos_cos_2, kind = "1d-ssa", svd.method = "svd")
plot(ts_cos_cos_2)
plot(s_cos_cos_2)
plot(s_cos_cos_2, type = "vectors", idx = 1:5)
plot(s_cos_cos_2, type = "paired", idx = 1:6, plot.contrib = FALSE)
plot(wcor(s_cos_cos_2, groups = 1:10),
scales = list(at = c(10, 20, 30)))
r_cos_cos_2 <- reconstruct(s_cos_cos_2,
groups = list(Seasonality = 1:4))
plot(r_cos_cos_2, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
То точная разделимость пропадает.
ts_cos_lin <- 5 * ts_cos + (1.5 * (1:169) - 2)
s_cos_lin <- ssa(ts_cos_lin, L = 78, kind = "1d-ssa", svd.method = "svd")
plot(ts_cos_lin)
plot(s_cos_lin)
plot(s_cos_lin, type = "vectors", idx = 1:5)
plot(s_cos_lin, type = "paired", idx = 1:4, plot.contrib = FALSE)
plot(wcor(s_cos_lin, groups = 1:10),
scales = list(at = c(10, 20, 30)))
r_cos_lin <- reconstruct(s_cos_lin,
groups = list(Trend = 1:2, Seasonality = 3:4))
plot(r_cos_lin, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
Теоретически точной разделимости линейного ряда и косинуса нет, но в данном случае длина ряда позволяет разделить их.
L_electr <- length(ts_electr_2) %/% 2 %/% 12 * 12
s_electr <- ssa(ts_electr_2, kind = "1d-ssa", svd.method = "svd", L = L_electr)
plot(ts_electr_2)
plot(s_electr)
plot(s_electr, type = "vectors", idx = 1:15)
plot(s_electr, type = "series", groups = list(c(1, 6), c(2:5, 7:8, 9:10)))
plot(s_electr, type = "paired", idx = 1:15, plot.contrib = FALSE)
plot(wcor(s_electr, groups = 1:30),
scales = list(at = c(10, 20, 30)))
r_electr <- reconstruct(s_electr,
groups = list(Trend = c(1, 6), Seasonality = c(2:5, 7:8, 9:10)))
plot(r_electr, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
spec.pgram(residuals(r_electr), taper = 0, log='no', fast = FALSE, demean = FALSE, detrend = FALSE)
Видно, что осталась какая-то сезонность и тренд, но обнаружить руками это достаточно трудно.
Попробуем автоматическую группировку для тренда и сезонности:
coef <- c(1 - 0.02, 1 + 0.02)
freq.bins.seas = list(trend = 0.05, s12 = 1/12 * coef, s6 = 1/6 * coef,
s4 = 1/4 * coef, s3 = 1/3 * coef,
s2.4 = 1/2.4 * coef, s2 = 1/2 * coef)
g_electr <- grouping.auto(s_electr, base = "series",
freq.bins = freq.bins.seas,
threshold = 0.7, groups = 1:20)
print(g_electr$trend)
## [1] 1 6 14
plot(g_electr, order = TRUE, type = "b")
r_electr_auto <- reconstruct(s_electr, g_electr)
plot(r_electr_auto, plot.method = "xyplot", superpose = TRUE,
add.residuals = FALSE)
Видим, что тренд неплохо выделился, но, возможно, немного смешался с сезонностью. А по спектрограмме видно, что нижние и основные частоты были убраны:
spec_electr_auto <- spec.pgram(residuals(r_electr_auto), taper = 0, log='no', fast = FALSE, demean = FALSE, detrend = FALSE, plot = FALSE)
w.pay <- seq(0, length.out = length(spec_electr$spec),
by = 1/length(ts_electr_2))
xyplot(log(spec_electr$spec) + log(spec_electr_auto$spec) ~ w.pay,
type = "l", xlab = NULL, ylab = NULL)
Применим улучшение разделимости DerivSSA:
plot(s_electr, type = "vectors", idx = 1:20)
fss_electr <- fossa(s_electr, nested.groups = 1:16, gamma = 10)
plot(fss_electr, type = "vectors", idx = 1:20)
plot(fss_electr, type = "series", groups = list(13:16, 1:12))
coef <- c(1 - 0.02, 1 + 0.02)
freq.bins.seas = list(trend = 0.05, s12 = 1/12 * coef, s6 = 1/6 * coef,
s4 = 1/4 * coef, s3 = 1/3 * coef,
s2.4 = 1/2.4 * coef, s2 = 1/2 * coef)
g_electr_fossa <- grouping.auto(fss_electr, base = "series",
freq.bins = freq.bins.seas,
threshold = 0.7, groups = 1:20)
print(g_electr_fossa$trend)
## [1] 13 14 15 16
plot(g_electr_fossa, order = TRUE, type = "b")
r_electr_auto_fossa <- reconstruct(fss_electr, g_electr_fossa)
plot(r_electr_auto_fossa, plot.method = "xyplot", superpose = TRUE,
add.residuals = FALSE)
Немного лучше, но идеально не стало.
Смоделируем полиномиальный ряд со степенью \(4\):
N_poli_4 <- 239
freq_12 <- 12
poli_trend <- 3 * ((1:N_poli_4) / 100) ** 4 - 5.5 * ((1:N_poli_4) / 100) ** 3 - 3 * ((1:N_poli_4) / 100) ** 2 + 1.5 * ((1:N_poli_4) / 100) + 15
ts_poli_4_poli <- ts(poli_trend, frequency = freq_12)
ts_poli_4 <- ts(poli_trend + 3 * cos(2 * pi * (1:N_poli_4) / freq_12 + pi / 3) + rnorm(N_poli_4), frequency = freq_12)
plot(ts_poli_4)
Попробуем выделить тренд обычным SSA:
L_poli_4 <- N_poli_4 %/% 2 %/% 12 * 12
s_poli_4 <- ssa(ts_poli_4, kind = "1d-ssa", svd.method = "svd", L = L_poli_4)
plot(s_poli_4)
plot(s_poli_4, type = "vectors", idx = 1:8)
plot(s_poli_4, type = "series", groups = list(c(1, 2, 5), c(3, 4)))
plot(wcor(s_poli_4, groups = 1:10),
scales = list(at = c(10, 20, 30)))
r_poli_4 <- reconstruct(s_poli_4,
groups = list(Trend = c(1, 2, 5), Seasonality = c(3, 4)))
plot(r_poli_4, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
Видим, что тренд смешался с сезонностью или шумом. Теперь попробуем применить SSA с проекциями:
r <- 4
s_poli_4_proj <- ssa(ts_poli_4, L = L_poli_4, column.projector = 3, row.projector = 2)
r_poli_4_proj <- reconstruct(s_poli_4_proj, groups =
list(Trend = seq_len(nspecial(s_poli_4_proj))))
fit_poli_4 <- ts(r_poli_4_proj$Trend, frequency = 12)
fit_poli_4_reg <- ts(predict(lm(fit_poli_4 ~ poly((1:length(ts_poli_4)), r, raw = TRUE))), frequency = 12)
plot(ts_poli_4)
lines(ts_poli_4_poli, col = "red")
lines(fit_poli_4, col = "blue")
lines(fit_poli_4_reg, col = "green")
legend(x = "top", c("original", "poly reg", "SSA proj", "SSA proj reg"), col = c("black", "red", "blue", "green"), lty = 1, lwd = 1)
Видим, что тренд действительно намного лучше выделился и почти совпал с реальным трендом.
Мы видели, что тренд ряда по электричеству неплохо аппроксимировался 4-ой степенью. Попробуем тогда применить SSA с проекциями для выделения полиномиального тренда:
r <- 4
s_electr_proj <- ssa(ts_electr_2, L = L_electr, column.projector = 3, row.projector = 2)
r_electr_proj <- reconstruct(s_electr_proj, groups =
list(Trend = seq_len(nspecial(s_electr_proj))))
fit_electr <- ts(r_electr_proj$Trend, frequency = 12)
fit_electr_reg <- ts(predict(lm(fit_electr ~ poly((1:length(ts_electr_2)), r, raw = TRUE))), frequency = 12)
plot(ts_electr_2)
lines(ts_electr_poly, col = "red")
lines(fit_electr, col = "blue")
lines(fit_electr_reg, col = "green")
legend(x = "bottomright", c("original", "poly reg", "SSA proj", "SSA proj reg"), col = c("black", "red", "blue", "green"), lty = 1, lwd = 1)
Функция, вычисляющая коэффициенты линейного фильтра на основе SSA:
make_filter_ssa <- function(u, L) {
k <- length(u)
mat1 <- t(matrix(rep(u[1:L], each = L), nrow = L))
mat2 <- matrix(0, nrow = L, ncol = L)
for (j in 0:(L - 1)) {
mat2[j + 1, ] <- c(u[(j + 1):L], rep(0, j))
}
res <- colSums(mat1 * mat2)
c(rev(res[-1]), res) / L
}
В результате, если взять точки от \(L\) до \(N - L + 1\), то получим нужный результат:
L <- 26
N <- 169
freq <- 13
s_cos_lin <- ssa(ts_cos_lin, L = L, kind = "1d-ssa", svd.method = "svd")
ts_cos_lin_filter_sum <- ts(rep(0, N), frequency = freq)
color_plot <- c("red", "blue", "green", "purple")
plot(ts_cos_lin)
for (i in 1:4) {
u <- s_cos_lin$U[, i]
filter_ssa <- make_filter_ssa(u, L)
ts_cos_lin_filter <- ts(filter_continue(ts_cos_lin, filter_ssa, FALSE), frequency = freq)
ts_cos_lin_filter_sum <- ts_cos_lin_filter_sum + ts_cos_lin_filter
lines(ts_cos_lin_filter, col = color_plot[i])
}
lines(ts_cos_lin_filter_sum, col = "orange")
abline(v = (N - L) / freq + 1, col = "gray", lty = 2)
abline(v = L / freq + 1, col = "gray", lty = 2)
legend(x = "topleft", c("original", "1", "2", "3", "4", "sum"), col = c("black", color_plot, "orange"), lty = 1, lwd = 1)
Посмотрим на АЧХ фильтра SSA для синуса при разных \(L\):
L_vec <- c(13, 26, 39, 52)
freq <- seq(0, pi, 0.001)
omega <- freq/2/pi
u <- ssa(ts_cos_lin, L = L_vec[1], kind = "1d-ssa", svd.method = "svd")$U[, 3]
plot(omega, afc(make_filter_ssa(u, L_vec[1]), freq), type = "l", ylab = "Frequency response")
u <- ssa(ts_cos_lin, L = L_vec[2], kind = "1d-ssa", svd.method = "svd")$U[, 3]
lines(omega, afc(make_filter_ssa(u, L_vec[2]), freq), type = "l", col = "red")
u <- ssa(ts_cos_lin, L = L_vec[3], kind = "1d-ssa", svd.method = "svd")$U[, 3]
lines(omega, afc(make_filter_ssa(u, L_vec[3]), freq), type = "l", col = "blue")
u <- ssa(ts_cos_lin, L = L_vec[4], kind = "1d-ssa", svd.method = "svd")$U[, 3]
lines(omega, afc(make_filter_ssa(u, L_vec[4]), freq), type = "l", col = "green")
legend(x="topright", c("L = 13", "L = 26", "L = 39", "L = 52"), col = c("black", "red", "blue", "green"), lty = 1, lwd = 1)
Амплитуда, модулированная косинусом с большим периодом:
ts_cos_cos_long <- ts(cos(2 * pi * (1:197) / 100) * cos(2 * pi * (1:197) / 6))
plot(ts_cos_cos_long)
ts_cos_cos_sq <- 2 * ts_cos_cos_long ** 2
plot(ts_cos_cos_sq)
s_cos_cos_sq <- ssa(ts_cos_cos_sq, L = 96, kind = "1d-ssa", svd.method = "svd")
plot(s_cos_cos_sq)
plot(s_cos_cos_sq, type = "vectors", idx = 1:10)
plot(s_cos_cos_sq, type = "series", groups = list(c(1, 2, 5)))
plot(wcor(s_cos_cos_sq, groups = 1:10),
scales = list(at = c(10, 20, 30)))
r_cos_cos_sq <- reconstruct(s_cos_cos_sq,
groups = list(Trend = c(1, 2, 5)))
plot(r_cos_cos_sq, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
plot(ts_cos_cos_long)
lines(sqrt(r_cos_cos_sq$Trend))
Амплитуда — корень:
ts_cos_sqrt <- ts(sqrt(1:197) * cos(2 * pi * (1:197) / 6))
plot(ts_cos_sqrt)
ts_cos_sqrt_sq <- 2 * ts_cos_sqrt ** 2
plot(ts_cos_sqrt_sq)
s_cos_sqrt_sq <- ssa(ts_cos_sqrt_sq, L = 96, kind = "1d-ssa", svd.method = "svd")
plot(s_cos_sqrt_sq)
plot(s_cos_sqrt_sq, type = "vectors", idx = 1:10)
plot(s_cos_sqrt_sq, type = "series", groups = list(c(1, 4)))
plot(wcor(s_cos_sqrt_sq, groups = 1:10),
scales = list(at = c(10, 20, 30)))
r_cos_sqrt_sq <- reconstruct(s_cos_sqrt_sq,
groups = list(Trend = c(1, 4)))
plot(r_cos_sqrt_sq, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
plot(ts_cos_sqrt)
lines(sqrt(r_cos_sqrt_sq$Trend))
N <- 300
sigma <- (5 + 3 * sin(2 * pi * (1:N) / 100))
ts_getero_noise <- ts(sigma * rnorm(N))
plot(ts_getero_noise)
lines(sigma)
ts_getero_noise_sq <- ts_getero_noise ** 2
plot(ts_getero_noise_sq)
s_getero_noise_sq <- ssa(ts_getero_noise_sq, kind = "1d-ssa", svd.method = "svd")
plot(s_getero_noise_sq)
plot(s_getero_noise_sq, type = "vectors", idx = 1:10)
plot(s_getero_noise_sq, type = "series", groups = list(c(1, 2, 3)))
plot(wcor(s_getero_noise_sq, groups = 1:10),
scales = list(at = c(10, 20, 30)))
r_getero_noise_sq <- reconstruct(s_getero_noise_sq,
groups = list(Trend = c(1, 2, 3)))
plot(r_getero_noise_sq, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
rebuild_sigma <- sqrt(r_getero_noise_sq$Trend)
plot(ts_getero_noise)
lines(rebuild_sigma, col = "blue")
lines(-rebuild_sigma, col = "blue")
lines(2 * rebuild_sigma, col = "red")
lines(-2 * rebuild_sigma, col = "red")
Для реального ряда электричества мы получаем, что параметр Box-cox преобразования очень близок к \(0.5\), что похоже на пуассоновский шум:
lambda <- BoxCox.lambda(ts_electr_2, method = "guerrero")
print(lambda)
## [1] 0.5172697
ts_electr_transform <- BoxCox(ts_electr_2, lambda)
plot(ts_electr_transform, type = "l")
s_electr_transform <- ssa(ts_electr_transform, kind = "1d-ssa", svd.method = "svd")
plot(s_electr_transform)
plot(s_electr_transform, type = "vectors", idx = 1:12)
plot(wcor(s_electr_transform, groups = 1:10),
scales = list(at = c(10, 20, 30)))
r_electr_transform <- reconstruct(s_electr_transform,
groups = list(Trend = c(1, 6), Seasonality = c(2:5, 7:10)))
plot(r_electr_transform, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
trend_electr_transform <- InvBoxCox(r_electr_transform$Trend, lambda)
plot(ts_electr_2)
lines(trend_electr_transform, col = "red")
lines(r_electr$Trend, col = "blue")
legend(x = "bottomright", legend = c("SSA + BC", "SSA"), lty = 1, col = c("red", "blue"))
Большой разницы нет.
Тренд, полученный SSA, усреднением, аппроксимацией, loess и т.д.:
plot(ts_electr_2)
lines(r_electr$Trend, col="red")
lines(r_electr_auto$trend, col="orange")
lines(r_electr_auto_fossa$trend, col="tomato4")
lines(fit_electr_reg, col="darkcyan")
lines(ts_electr_2_filter_24_2, col="blue")
lines(ts_electr_poly, col="green")
lines(ts_electr_loess_50, col="purple")
lines(ts_electr_hp_1, col="aquamarine")
legend(x = "bottomright", legend = c("SSA", "SSA auto", "FOSSA", "SSA poly reg", "Filter", "Polinom", "Loess", "HP"), lty = 1, col = c("red", "orange", "tomato4", "darkcyan", "blue", "green", "purple", "aquamarine"))
Смоделируем ряд Фибоначчи Линейной реккурентной формулой:
lrr_generate <- function(coef, first_members, n) {
res <- c(first_members)
k <- length(coef)
for (i in (k + 1):n) {
res <- c(res, sum(res[(i - k):(i - 1)] * coef))
}
res
}
N_fib <- 10
first_members_fib <- c(1, 1)
coef_fib <- c(1, 1)
fb_series <- lrr_generate(coef_fib, first_members_fib, N_fib)
cat("Fibonacci original series: ", fb_series, "\n")
## Fibonacci original series: 1 1 2 3 5 8 13 21 34 55
Теперь попробуем обратно из ряда получить коэффициенты ЛРФ. Для чего сначала составим траекторную матрицу:
create_tr_matrix <- function(series, L) {
N <- length(series)
K <- N - L + 1
X <- matrix(nrow = L, ncol = K)
for (i in 1:L) {
for (j in 1:K) {
X[i, j] <- series[j + i - 1]
}
}
X
}
L_fib <- 5
X_fib <- create_tr_matrix(fb_series, L = L_fib)
X_fib
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1 1 2 3 5 8
## [2,] 1 2 3 5 8 13
## [3,] 2 3 5 8 13 21
## [4,] 3 5 8 13 21 34
## [5,] 5 8 13 21 34 55
Наконец найдём вектор \(B\) (ортогональный пространству столбцов матрицы, и последний элемент который равен \(-1\)):
find_min_norm_null_vector <- function(X) {
X <- t(X)
svd_res <- svd(X)
V <- svd_res$v
small_singular <- which(svd_res$d < sqrt(.Machine$double.eps))
if (length(small_singular) == 0) {
stop("У матрицы нет ненулевого нулевого пространства")
}
Null_basis <- V[, small_singular, drop = FALSE]
dim_null <- ncol(Null_basis)
Dmat <- t(Null_basis) %*% Null_basis
dvec <- rep(0, dim_null)
Aeq <- matrix(0, nrow = 1, ncol = dim_null)
Aeq[1, ] <- Null_basis[nrow(Null_basis), ]
beq <- -1
sol <- solve.QP(Dmat, dvec, t(Aeq), beq, meq = 1)
B <- Null_basis %*% sol$solution
B
}
B_fib <- find_min_norm_null_vector(X_fib)
cat("B:\n")
## B:
print(B_fib)
## [,1]
## [1,] 0.3333333
## [2,] 0.3333333
## [3,] 0.6666667
## [4,] 1.0000000
## [5,] -1.0000000
Проверим насколько эти коэффициенты соответствуют нашему ряду:
coef_fib_re <- B_fib[1:(length(B_fib) - 1)]
fb_series_re <- lrr_generate(coef_fib_re, fb_series[1:length(coef_fib_re)], N_fib)
cat("Fibonacci restored series: ", fb_series_re, "\n")
## Fibonacci restored series: 1 1 2 3 5 8 13 21 34 55
Соответствуют полностью!
Используем характеристические функции для поиска явной формулы ряда:
cluster_complex_numbers <- function(values, epsilon = 1e-4) {
n <- length(values)
adj_matrix <- matrix(0, n, n)
if (n > 1) {
for (i in 1:(n - 1)) {
for (j in (i+1):n) {
if (Mod(values[i] - values[j]) <= epsilon) {
adj_matrix[i, j] <- 1
adj_matrix[j, i] <- 1
}
}
}
}
g <- graph_from_adjacency_matrix(adj_matrix, mode = "undirected")
clusters <- components(g)$membership
unique_clusters <- unique(clusters)
results <- list()
for (cl in unique_clusters) {
group_values <- values[clusters == cl]
mean_value <- mean(group_values)
group_size <- length(group_values)
results <- append(results, list(list(mean = mean_value, size = group_size)))
}
return(results)
}
find_explicit_formula <- function(coeffs, initial_terms) {
k <- length(coeffs)
poly_roots <- polyroot(c(-coeffs, 1))
roots_group <- cluster_complex_numbers(poly_roots)
basis <- matrix(0, nrow = k, ncol = k)
for (i in 1:k) {
t <- 1
for (elem in roots_group) {
roots_group_size <- elem$size
roots_group_mean <- elem$mean
for (l in 1:roots_group_size) {
basis[i, t] <- i ** (l - 1) * roots_group_mean ** i
t <- t + 1
}
}
}
A <- ginv(basis) %*% initial_terms
t <- 1
for (j in 1:length(roots_group)) {
roots_group_size <- roots_group[[j]]$size
roots_group[[j]]$coef <- A[t:(t + roots_group_size - 1)]
t <- t + roots_group_size
}
roots_group
}
list_formula_fib <- find_explicit_formula(coef_fib, first_members_fib)
list_formula_fib
## [[1]]
## [[1]]$mean
## [1] -0.618034-1.722177e-18i
##
## [[1]]$size
## [1] 1
##
## [[1]]$coef
## [1] -0.4472136-7.701809e-19i
##
##
## [[2]]
## [[2]]$mean
## [1] 1.618034+1.722177e-18i
##
## [[2]]$size
## [1] 1
##
## [[2]]$coef
## [1] 0.4472136-1.568657e-18i
Как это выглядит в виде математической формулы:
round_complex <- function(x, digits = 4) {
round(Re(x), digits) + round(Im(x), digits) * 1i
}
create_str_formula <- function(list_formula) {
res <- "x _n = "
first_plus <- TRUE
for (elem in list_formula) {
if (first_plus) {
first_plus <- FALSE
}
else {
res <- paste(res, " + ", sep = "")
}
list_formula_size <- elem$size
list_formula_mean <- elem$mean
list_formula_coef <- elem$coef
res <- paste(res, "(", sep = "")
for (i in 1:list_formula_size) {
if (i != 1) {
res <- paste(res, " + ", sep = "")
}
res <- paste(res, "(", toString(round_complex(list_formula_coef[i])), ")", sep = "")
if (i != 1){
res <- paste(res, " * n", sep = "")
}
if (i > 2) {
res <- paste(res, " ^(", toString(i - 1), ")", sep = "")
}
}
res <- paste(res, ") * (", toString(round_complex(list_formula_mean)), ") ^n", sep = "")
}
res
}
create_str_formula(list_formula_fib)
## [1] "x _n = ((-0.4472+0i)) * (-0.618+0i) ^n + ((0.4472+0i)) * (1.618+0i) ^n"
Наконец, заново смоделируем числа Фибоначчи с помощью явной формулы:
model_elem_by_formula <- function(list_formula, n) {
res <- 0
for (elem in list_formula) {
list_formula_size <- elem$size
list_formula_mean <- elem$mean
list_formula_coef <- elem$coef
brace_sum <- 0
for (i in 1:list_formula_size) {
brace_sum <- brace_sum + list_formula_coef[i] * n ** (i - 1)
}
res <- res + brace_sum * list_formula_mean ** n
}
res
}
model_series_by_formula <- function(list_formula, size) {
res <- c()
for (i in 1:size) {
res <- c(res, Re(model_elem_by_formula(list_formula, i)))
}
res
}
fb_series_formula <- model_series_by_formula(list_formula_fib, N_fib)
fb_series_formula
## [1] 1 1 2 3 5 8 13 21 34 55
Сделаем то же самое с косинусом:
N_cos <- 10
cos_series_orig <- cos(2 * pi * (1:N_cos) / freq_12)
first_members_cos <- c(cos(2 * pi / freq_12), cos(4 * pi / freq_12))
coef_cos <- c(-1, 2 * cos(2 * pi / freq_12))
coef_cos
## [1] -1.000000 1.732051
cos_series <- lrr_generate(coef_cos, first_members_cos, N_cos)
list_formula_cos <- find_explicit_formula(coef_cos, first_members_cos)
create_str_formula(list_formula_cos)
## [1] "x _n = ((0.5+0i)) * (0.866+0.5i) ^n + ((0.5+0i)) * (0.866-0.5i) ^n"
cos_series_formula <- model_series_by_formula(list_formula_cos, N_cos)
L_cos <- 5
X_cos <- create_tr_matrix(cos_series, L = L_cos)
X_cos
## [,1] [,2] [,3] [,4] [,5]
## [1,] 8.660254e-01 5.000000e-01 2.220446e-16 -0.5000000 -8.660254e-01
## [2,] 5.000000e-01 2.220446e-16 -5.000000e-01 -0.8660254 -1.000000e+00
## [3,] 2.220446e-16 -5.000000e-01 -8.660254e-01 -1.0000000 -8.660254e-01
## [4,] -5.000000e-01 -8.660254e-01 -1.000000e+00 -0.8660254 -5.000000e-01
## [5,] -8.660254e-01 -1.000000e+00 -8.660254e-01 -0.5000000 -8.881784e-16
## [,6]
## [1,] -1.000000e+00
## [2,] -8.660254e-01
## [3,] -5.000000e-01
## [4,] -8.881784e-16
## [5,] 5.000000e-01
B_cos <- find_min_norm_null_vector(X_cos)
cat("B:\n")
## B:
print(B_cos)
## [,1]
## [1,] -0.5384615
## [2,] -0.1332347
## [3,] 0.3076923
## [4,] 0.6661734
## [5,] -1.0000000
coef_cos_re <- B_cos[1:(length(B_cos) - 1)]
cos_series_re <- lrr_generate(coef_cos_re, cos_series[1:length(coef_cos_re)], N_cos)
cat("Cos restored series: ", cos_series_re, "\n")
## Cos restored series: 0.8660254 0.5 2.220446e-16 -0.5 -0.8660254 -1 -0.8660254 -0.5 -2.498002e-16 0.5
plot(cos_series_orig, type = "l", col = "red")
lines(cos_series, lty = 2, lwd = 2, col = "blue")
lines(cos_series_re, lty = 6, lwd = 2, col = "purple")
lines(cos_series_formula, lty = 3, lwd = 2, col = "green")
legend(x="top", legend = c("original", "lrr_generate", "restored", "formula"), lty = c(1, 2, 6, 3), col = c("red", "blue", "purple", "green"))
Сгенерируем сложный ряд с экспонентой, полиномом и косинусом:
N_epc <- 30
epc_series <- exp(2 * (1:N_epc) / N_epc) * (2 *(2 * (1:N_epc) / N_epc) ** 2 - 3 * (2 * (1:N_epc) / N_epc) + 6) * cos(2 * pi * (1:N_epc) / freq_12 + pi / 3)
epc_series
## [1] 3.802129e-16 -3.219680e+00 -5.796557e+00 -6.974833e+00 -6.311765e+00
## [6] -3.819071e+00 -1.475096e-15 4.234996e+00 7.763766e+00 9.522255e+00
## [11] 8.790981e+00 5.430320e+00 3.570528e-15 -6.283965e+00 -1.177051e+01
## [16] -1.474793e+01 -1.390363e+01 -8.765109e+00 -8.227981e-15 1.053797e+01
## [21] 2.008810e+01 2.558473e+01 2.448707e+01 1.565158e+01 -4.252692e-14
## [26] -1.926741e+01 -3.709317e+01 -4.765243e+01 -4.594904e+01 -2.955622e+01
Найдём ЛРФ этого ряда:
L_epc <- 7
X_epc <- create_tr_matrix(epc_series, L = L_epc)
B_epc <- find_min_norm_null_vector(X_epc)
cat("B:\n")
## B:
print(B_epc)
## [,1]
## [1,] -1.491825
## [2,] 7.251815
## [3,] -15.667262
## [4,] 19.039785
## [5,] -13.711570
## [6,] 5.554371
## [7,] -1.000000
coef_epc_re <- B_epc[1:(length(B_epc) - 1)]
first_members_epc <- epc_series[1:length(coef_epc_re)]
epc_series_re <- lrr_generate(coef_epc_re, first_members_epc, N_epc)
cat("Restored series: ", epc_series_re, "\n")
## Restored series: 3.802129e-16 -3.21968 -5.796557 -6.974833 -6.311765 -3.819071 -3.609613e-14 4.234996 7.763766 9.522255 8.790981 5.43032 1.601264e-11 -6.283965 -11.77051 -14.74793 -13.90363 -8.765109 -5.845595e-11 10.53797 20.0881 25.58473 24.48707 15.65158 1.42671e-10 -19.26741 -37.09317 -47.65243 -45.94904 -29.55622
А теперь по ЛРФ найдём обратно явную формулу:
list_formula_epc <- find_explicit_formula(coef_epc_re, first_members_epc)
create_str_formula(list_formula_epc)
## [1] "x _n = ((1.5+2.5981i) + (-0.05-0.0866i) * n + (0.0022+0.0038i) * n ^(2)) * (0.9257+0.5345i) ^n + ((1.5-2.5981i) + (-0.05+0.0866i) * n + (0.0022-0.0038i) * n ^(2)) * (0.9257-0.5345i) ^n"
epc_series_formula <- model_series_by_formula(list_formula_epc, N_epc)
Видим, что всё совпадает:
plot(epc_series, type = "l", col = "red")
lines(epc_series_re, lty = 6, lwd = 2, col = "purple")
lines(epc_series_formula, lty = 3, lwd = 2, col = "green")
legend(x="top", legend = c("original", "lrr_generate", "formula"), lty = c(1, 6, 3), col = c("red", "purple", "green"))
До этого найденные формулы имели комплексные числа в своей записи, хотя выдавали вещественные результаты, то есть по сути своей составляли вещественные выражения в комплексных значениях. Перепишим прошлые функции так, чтобы они выдавали суммы произведений косинусов, степеней и полиномов.
Функция кластеризации комплексных чисел почти не изменилась:
cluster_complex_numbers_conj <- function(values, epsilon = 1e-3) {
n <- length(values)
adj_matrix <- matrix(0, n, n)
if (n > 1) {
for (i in 1:(n - 1)) {
for (j in (i+1):n) {
if (Mod(values[i] - values[j]) / max(Mod(values[i]), abs(Mod(values[j]))) <= epsilon) {
adj_matrix[i, j] <- 1
adj_matrix[j, i] <- 1
}
}
}
}
g <- graph_from_adjacency_matrix(adj_matrix, mode = "undirected")
clusters <- components(g)$membership
unique_clusters <- unique(clusters)
results <- list()
for (cl in unique_clusters) {
group_values <- values[clusters == cl]
mean_value <- mean(group_values)
group_size <- length(group_values)
mod_value <- Mod(mean_value)
arg_value <- Arg(mean_value)
period_value <- 2 * pi / arg_value
results <- append(results, list(list(mean = mean_value,
period = period_value,
arg = arg_value,
mod = mod_value,
size = group_size)))
}
results$type <- "raw"
results
}
В функции нахождения явной формулы изменим нахождение корней через polyroot на метод ssa, а также для периодик перепишим нахождение коэффициентов:
find_roots_simple <- function(series) {
len_series <- length(series)
s_series_pre <- ssa(series, L = len_series %/% 2, svd.method = "svd")
r <- sum(s_series_pre$sigma > sqrt(.Machine$double.eps))
s_series <- ssa(series, L = r + 1, svd.method = "svd")
l_series <- lrr(s_series, groups = list(1:r))
roots(l_series)
}
find_explicit_formula_conj <- function(series, roots = c(), epsilon = 1e-3) {
len_series <- length(series)
if (length(roots) == 0)
roots_series <- find_roots_simple(series)
else
roots_series <- roots
r <- length(roots_series)
roots_group <- cluster_complex_numbers_conj(roots_series, epsilon)
basis <- matrix(0, nrow = len_series, ncol = r)
first_to_len <- 1:len_series
t <- 1
for (i in 1:(length(roots_group) - 1)) {
elem <- roots_group[[i]]
roots_group_size <- elem$size
roots_group_mod <- elem$mod
roots_group_period <- elem$period
if (roots_group_period > 1e16)
powers <- roots_group_mod ** first_to_len
else if (roots_group_period == 2)
powers <- (-roots_group_mod) ** first_to_len
else if (roots_group_period > 0)
powers <- roots_group_mod ** first_to_len * sin(2 * pi * first_to_len / roots_group_period)
else
powers <- roots_group_mod ** first_to_len * cos(2 * pi * first_to_len / roots_group_period)
for (l in 1:roots_group_size) {
basis[first_to_len, t] <- first_to_len ** (l - 1) * powers
t <- t + 1
}
}
A <- lm(series ~ 0 + basis)
A <- A$coefficients
names(A) <- c()
t <- 1
for (j in 1:(length(roots_group) - 1)) {
roots_group_size <- roots_group[[j]]$size
roots_group[[j]]$coef <- A[t:(t + roots_group_size - 1)]
t <- t + roots_group_size
}
roots_group
}
Аналогичные изменения периодик внесём в функции нахождения строки и моделирования ряда:
create_str_formula_conj <- function(list_formula, digits = 4) {
res <- "x _n = "
first_plus <- TRUE
for (i in 1:(length(list_formula) - 1)) {
elem <- list_formula[[i]]
if (first_plus) {
first_plus <- FALSE
}
else {
res <- paste(res, " + ", sep = "")
}
list_formula_coef <- elem$coef
list_formula_size <- elem$size
list_formula_mod <- elem$mod
list_formula_period <- elem$period
res <- paste(res, "(", sep = "")
for (i in 1:list_formula_size) {
if (i != 1) {
res <- paste(res, " + (", sep = "")
}
res <- paste(res, toString(round(list_formula_coef[i], digits)), sep = "")
if (i != 1){
res <- paste(res, ") * n", sep = "")
}
if (i > 2) {
res <- paste(res, " ^(", toString(i - 1), ")", sep = "")
}
}
if (abs(list_formula_period) > 1e16)
res <- paste(res, ") * ", toString(round(list_formula_mod, digits)), " ^n", sep = "")
else if (list_formula_period == 2)
res <- paste(res, ") * (", toString(-round(list_formula_mod, digits)), ") ^n", sep = "")
else {
res <- paste(res, ") * ", sep = "")
if (round(list_formula_mod, digits) != 1) {
res <- paste(res, toString(round(list_formula_mod, digits))," ^n * ", sep = "")
}
if (list_formula_period > 0) {
if (list_formula$type == "raw") {
res <- paste(res, "sin(2 * pi * n / ", toString(round(list_formula_period, digits)), ")", sep = "")
}
else if (list_formula$type == "compressed") {
res <- paste(res, "cos(2 * pi * n / ", toString(round(list_formula_period, digits)), " + (", toString(round(elem$phi, digits)), "))", sep = "")
}
}
else
res <- paste(res, "cos(2 * pi * n / ", toString(abs(round(list_formula_period, digits))), ")", sep = "")
}
}
res
}
model_elem_by_formula_conj <- function(list_formula, n) {
res <- 0
for (i in 1:(length(list_formula) - 1)) {
elem <- list_formula[[i]]
list_formula_coef <- elem$coef
list_formula_size <- elem$size
list_formula_mod <- elem$mod
list_formula_period <- elem$period
brace_sum <- 0
for (i in 1:list_formula_size) {
brace_sum <- brace_sum + list_formula_coef[i] * n ** (i - 1)
}
if (abs(list_formula_period) > 1e16)
res <- res + brace_sum * list_formula_mod ** n
else if (list_formula_period == 2)
res <- res + brace_sum * (-list_formula_mod) ** n
else if (list_formula_period > 0) {
if (list_formula$type == "raw") {
res <- res + brace_sum * list_formula_mod ** n * sin(2 * pi * n / list_formula_period)
}
else if (list_formula$type == "compressed") {
res <- res + brace_sum * list_formula_mod ** n * cos(2 * pi * n / list_formula_period + elem$phi)
}
}
else
res <- res + brace_sum * list_formula_mod ** n * cos(2 * pi * n / list_formula_period)
}
res
}
model_series_by_formula_conj <- function(list_formula, size) {
res <- c()
for (i in 1:size) {
res <- c(res, Re(model_elem_by_formula_conj(list_formula, i)))
}
res
}
Функция нахождения явной формулы не объединяет синусы и косинусы с одинаковым периодом, поэтому сделаем это в отдельной формуле:
compress_sincos_to_cos <- function(list_formula) {
list_formula_cos <- list()
for (i in 1:(length(list_formula) - 1)) {
elem <- list_formula[[i]]
period <- elem$period
coef <- elem$coef[1]
if (elem$period < 0 & elem$period > -1e16) {
for (j in 1:(length(list_formula) - 1)) {
elem1 <- list_formula[[j]]
if (abs(abs(period) - elem1$period) < 1e-8) {
coef1 <- elem1$coef[1]
phi <- -atan2(coef1, coef)
coef_new <- elem$coef / cos(phi)
list_formula_cos <- append(list_formula_cos, list(list(mean = elem$mean,
period = abs(period),
arg = elem$arg,
mod = elem$mod,
size = elem$size,
coef = coef_new,
phi = phi)))
}
}
}
else if (elem$period == 2 | abs(elem$period) > 1e16) {
list_formula_cos <- append(list_formula_cos, list(list(mean = elem$mean,
period = period,
arg = elem$arg,
mod = elem$mod,
size = elem$size,
coef = elem$coef,
phi = 0)))
}
}
list_formula_cos$type <- "compressed"
list_formula_cos
}
В качестве примера рассмотрим прошлый сложный ряд и ещё прибавим к нему косинус с другим периодом и смещением: \[ x _n = (6 - 0.2 n + 0.0089 n ^2) \cdot e ^{2n / 30} \cos(2 \pi n / 12 + \pi / 3) + \cos(2 \pi n / 5 - \pi / 4) \]
epc_sin_series <- epc_series + cos(2 * pi * (1:N_epc) / 5 - pi / 4)
plot(epc_sin_series, type = "l")
Найдём явную формулу в виде списка параметров:
list_formula_epc_sin <- find_explicit_formula_conj(epc_sin_series)
list_formula_epc_sin
## [[1]]
## [[1]]$mean
## [1] 0.9257284+0.5344696i
##
## [[1]]$period
## [1] 12
##
## [[1]]$arg
## [1] 0.5235988
##
## [[1]]$mod
## [1] 1.068939
##
## [[1]]$size
## [1] 3
##
## [[1]]$coef
## [1] -5.196152423 0.173205081 -0.007698004
##
##
## [[2]]
## [[2]]$mean
## [1] 0.9257284-0.5344696i
##
## [[2]]$period
## [1] -12
##
## [[2]]$arg
## [1] -0.5235988
##
## [[2]]$mod
## [1] 1.068939
##
## [[2]]$size
## [1] 3
##
## [[2]]$coef
## [1] 3.000000000 -0.100000000 0.004444444
##
##
## [[3]]
## [[3]]$mean
## [1] 0.309017+0.9510565i
##
## [[3]]$period
## [1] 5
##
## [[3]]$arg
## [1] 1.256637
##
## [[3]]$mod
## [1] 1
##
## [[3]]$size
## [1] 1
##
## [[3]]$coef
## [1] 0.7071068
##
##
## [[4]]
## [[4]]$mean
## [1] 0.309017-0.9510565i
##
## [[4]]$period
## [1] -5
##
## [[4]]$arg
## [1] -1.256637
##
## [[4]]$mod
## [1] 1
##
## [[4]]$size
## [1] 1
##
## [[4]]$coef
## [1] 0.7071068
##
##
## $type
## [1] "raw"
create_str_formula_conj(list_formula_epc_sin)
## [1] "x _n = (-5.1962 + (0.1732) * n + (-0.0077) * n ^(2)) * 1.0689 ^n * sin(2 * pi * n / 12) + (3 + (-0.1) * n + (0.0044) * n ^(2)) * 1.0689 ^n * cos(2 * pi * n / 12) + (0.7071) * sin(2 * pi * n / 5) + (0.7071) * cos(2 * pi * n / 5)"
И сократим запись убрав синусы и добавив смещение:
list_formula_epc_sin_comp <- compress_sincos_to_cos(list_formula_epc_sin)
list_formula_epc_sin_comp
## [[1]]
## [[1]]$mean
## [1] 0.9257284-0.5344696i
##
## [[1]]$period
## [1] 12
##
## [[1]]$arg
## [1] -0.5235988
##
## [[1]]$mod
## [1] 1.068939
##
## [[1]]$size
## [1] 3
##
## [[1]]$coef
## [1] 6.000000000 -0.200000000 0.008888889
##
## [[1]]$phi
## [1] 1.047198
##
##
## [[2]]
## [[2]]$mean
## [1] 0.309017-0.9510565i
##
## [[2]]$period
## [1] 5
##
## [[2]]$arg
## [1] -1.256637
##
## [[2]]$mod
## [1] 1
##
## [[2]]$size
## [1] 1
##
## [[2]]$coef
## [1] 1
##
## [[2]]$phi
## [1] -0.7853982
##
##
## $type
## [1] "compressed"
create_str_formula_conj(list_formula_epc_sin_comp)
## [1] "x _n = (6 + (-0.2) * n + (0.0089) * n ^(2)) * 1.0689 ^n * cos(2 * pi * n / 12 + (1.0472)) + (1) * cos(2 * pi * n / 5 + (-0.7854))"
Полученная формула аналогична оригинальной. По найденной формуле построим ряд и сравним с оригиналом:
epc_sin_series_formula <- model_series_by_formula_conj(list_formula_epc_sin_comp, N_epc)
plot(epc_sin_series, type = "l", col = "red")
lines(epc_sin_series_formula, lty = 3, lwd = 2, col = "green")
legend(x="bottom", legend = c("original", "formula (compressed)"), lty = c(1, 3), col = c("red", "green"))
Как и ожидалось, они совпадают.
Давайте внесём немного шума в ряд. Для этого возьмём для начала ряд косинуса плюс экспонента:
N_cos_long <- 100
cos_series_long <- exp(2 * (1:N_cos_long) / N_cos_long) + cos(2 * pi * (1:N_cos_long) / 12 + pi / 3)
plot(cos_series_long, type = "l")
Внесём шум:
cos_series_long_noise <- cos_series_long + rnorm(N_cos_long, 0, 1)
plot(cos_series_long_noise, type = "l")
Посмотрим, что выдаст прошлый алгоритм. Но в первую очередь на вывод lrr после того как мы передадим ему ряд с группами найденного сигнала:
L_cos_long <- (N_cos_long %/% 2) %/% 12 * 12
s_cos_series_long_noise <- ssa(cos_series_long_noise, L = L_cos_long, svd.method = "svd")
plot(s_cos_series_long_noise, type = "vectors", idx = 1:10)
plot(s_cos_series_long_noise, type = "series", groups = list(c(1, 2, 3)))
plot(wcor(s_cos_series_long_noise, groups = 1:10),
scales = list(at = c(10, 20, 30)))
r_cos_series_long_noise <- reconstruct(s_cos_series_long_noise,
groups = list(Trend = 1, Seasonality = c(2, 3)))
plot(r_cos_series_long_noise, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
l_cos_series_long_noise <- lrr(s_cos_series_long_noise, groups = list(1:3))
plot(l_cos_series_long_noise)
Видим очень много корней, но есть те которые выделяются на фоне других. С помощью функции parestimate мы можем найти сигнальные корни из всех остальных:
par_cos_long_noise <- parestimate(s_cos_series_long_noise, groups = list(1:3), method = "esprit")
par_cos_long_noise
## period rate | Mod Arg | Re Im
## Inf 0.018919 | 1.01910 0.00 | 1.01910 0.00000
## 12.041 -0.001495 | 0.99851 0.52 | 0.86561 0.49773
## -12.041 -0.001495 | 0.99851 -0.52 | 0.86561 -0.49773
Осталось только передать эти корни в нашу функцию (которую, правда, надо под это переделать):
list_formula_cos_long_noise <- find_explicit_formula_conj(cos_series_long_noise, roots = par_cos_long_noise$roots)
list_formula_cos_long_noise_comp <- compress_sincos_to_cos(list_formula_cos_long_noise)
create_str_formula_conj(list_formula_cos_long_noise_comp)
## [1] "x _n = (1.0217) * 1.0191 ^n + (1.2642) * 0.9985 ^n * cos(2 * pi * n / 12.0406 + (1.1079))"
cos_series_long_noise_formula <- model_series_by_formula_conj(list_formula_cos_long_noise_comp, 100)
plot(cos_series_long_noise, type = "l", col = "red")
lines(cos_series_long, lty = 6, lwd = 2, col = "purple")
lines(cos_series_long_noise_formula, lty = 3, lwd = 2, col = "green")
legend(x="bottom", legend = c("original + noise", "original", "formula (compressed)"), lty = c(1, 6, 3), col = c("red", "purple", "green"))
Применим итерации Cadzow к нашему сложному модельному ряду:
series <- cos_series_long_noise
N <- length(series)
L <- L_cos_long
K <- N - L + 1
rank <- 3
s0 <- ssa(series, L = L)
r0 <- reconstruct(s0, groups = list(signal = 1:rank))$signal
alpha <- 0.1
weights <- numeric(K)
weights[1:K] <- alpha
weights[seq(from = K, to = 1, by = -L)] <- 1
s <- ssa(series, L = L, column.oblique = "identity",
row.oblique = weights, decompose.force = FALSE)
c <- cadzow(s, rank = rank)
sc <- ssa(c, L = rank + 1)
rc <- reconstruct(sc, groups = list(signal = 1:rank))$signal
plot(series, type = "l")
lines(cos_series_long, col = "purple")
lines(r0, col = "blue")
lines(rc, col = "red", lty = 2)
legend(x = "topleft", c("original + noise", "original", "ssa", "cadzow"), col = c("black", "purple", "blue", "red"), lty = c(1, 1, 1, 2))
Видно, что он немного отличается от ssa, причём не в лучшую сторону. Посмотрим на формулу:
list_formula_cos_long_noise_c <- find_explicit_formula_conj(rc, parestimate(sc, groups = list(1:rank), method = "esprit")$roots)
list_formula_cos_long_noise_c_comp <- compress_sincos_to_cos(list_formula_cos_long_noise_c)
cat("Original:", create_str_formula_conj(list_formula_cos_long_noise_comp), "\n")
## Original: x _n = (1.0217) * 1.0191 ^n + (1.2642) * 0.9985 ^n * cos(2 * pi * n / 12.0406 + (1.1079))
cat("Cadzow:", create_str_formula_conj(list_formula_cos_long_noise_c_comp))
## Cadzow: x _n = (1.0403) * 1.019 ^n + (1.3107) * 0.9981 ^n * cos(2 * pi * n / 11.9986 + (1.0071))
Она значительно поменялась.
Вспомним ряд электричества и найдём для него сигнальные корни:
plot(s_electr, type = "series", groups = list(c(1, 6), c(2:5, 7:10)))
l_electr <- lrr(s_electr, groups = list(1:20))
plot(l_electr)
Берём определённую группу, чтобы не вносить шум в формулу:
par_electr <- parestimate(s_electr, groups = list(c(1:10, 14)), method = "esprit")
par_electr
## period rate | Mod Arg | Re Im
## Inf 0.000948 | 1.00095 -0.00 | 1.00095 -0.00000
## 12.014 -0.000249 | 0.99975 0.52 | 0.86610 0.49936
## -12.014 -0.000249 | 0.99975 -0.52 | 0.86610 -0.49936
## 5.996 -0.000659 | 0.99934 1.05 | 0.49900 0.86584
## -5.996 -0.000659 | 0.99934 -1.05 | 0.49900 -0.86584
## 3.999 -0.001616 | 0.99839 1.57 | -0.00028 0.99839
## -3.999 -0.001616 | 0.99839 -1.57 | -0.00028 -0.99839
## 2.399 -0.003373 | 0.99663 2.62 | -0.86382 0.49709
## -2.399 -0.003373 | 0.99663 -2.62 | -0.86382 -0.49709
## Inf -0.016668 | 0.98347 -0.00 | 0.98347 -0.00000
## Inf -0.045323 | 0.95569 0.00 | 0.95569 0.00000
По корням строим формулу:
list_formula_electr <- find_explicit_formula_conj(ts_electr_2, roots = par_electr$roots)
list_formula_electr_comp <- compress_sincos_to_cos(list_formula_electr)
create_str_formula_conj(list_formula_electr)
## [1] "x _n = (112739.0862) * 1.0009 ^n + (-8976.8651) * 0.9998 ^n * sin(2 * pi * n / 12.0135) + (-7280.0665) * 0.9998 ^n * cos(2 * pi * n / 12.0135) + (12884.5383) * 0.9993 ^n * sin(2 * pi * n / 5.9956) + (1856.4944) * 0.9993 ^n * cos(2 * pi * n / 5.9956) + (-1088.3709) * 0.9984 ^n * sin(2 * pi * n / 3.9993) + (3749.2924) * 0.9984 ^n * cos(2 * pi * n / 3.9993) + (4316.0924) * 0.9966 ^n * sin(2 * pi * n / 2.3987) + (-422.6512) * 0.9966 ^n * cos(2 * pi * n / 2.3987) + (998.5436) * 0.9835 ^n + (-45342.0133) * 0.9557 ^n"
create_str_formula_conj(list_formula_electr_comp)
## [1] "x _n = (112739.0862) * 1.0009 ^n + (11557.8317) * 0.9998 ^n * cos(2 * pi * n / 12.0135 + (2.2522)) + (13017.5996) * 0.9993 ^n * cos(2 * pi * n / 5.9956 + (-1.4277)) + (3904.0677) * 0.9984 ^n * cos(2 * pi * n / 3.9993 + (0.2825)) + (4336.737) * 0.9966 ^n * cos(2 * pi * n / 2.3987 + (-1.6684)) + (998.5436) * 0.9835 ^n + (-45342.0133) * 0.9557 ^n"
ts_electr_formula <- ts(model_series_by_formula_conj(list_formula_electr_comp, length(ts_electr_2)), frequency = 12)
plot(ts_electr_2, type = "l", col = "red")
lines(r_electr$Trend + r_electr$Seasonality, lty = 6, lwd = 2, col = "purple")
lines(ts_electr_formula, lty = 3, lwd = 2, col = "green")
legend(x="bottom", legend = c("original", "ssa", "formula (compressed)"), lty = c(1, 3), col = c("red", "purple", "green"))
Видим, что она получилась достаточно похожей на то, что выделил SSA.
Проверим, что если перевернуть ряд, то несигнальные корни останутся внутри единичного круга, а сигнальные поменяют расположение. Возьмём наш сложный ряд и добавим туда экспоненту с отрицательным показателем:
N_epc <- length(epc_series)
L_epc_exp <- 8
epc_series_exp <- epc_series + exp(-4 * (1:N_epc) / N_epc)
plot(epc_series_exp, type = "l")
s_epc_series_exp <- ssa(epc_series_exp, L = N_epc %/% 2, svd.method = "svd")
l_epc_series_exp <- lrr(s_epc_series_exp, groups = list(1:(L_epc_exp - 1)))
plot(l_epc_series_exp)
А теперь перевёрнутый
s_epc_series_exp_rev <- ssa(rev(epc_series_exp), L = N_epc %/% 2, svd.method = "svd")
l_epc_series_exp_rev <- lrr(s_epc_series_exp_rev, groups = list(1:(L_epc_exp - 1)))
plot(l_epc_series_exp_rev)
Действительно корни по модулю меньшие нуля стали больше нуля, а большие — меньше.
Применим IOSSA к ряду косинуса плюс полином второй степени:
N_cos_poly <- 100
ts_cos_12 <- ts(cos(2 * pi * (1:N_cos_poly) / 12), frequency = 12)
ts_poli_2 <- ts(5 * (2 * (1:N_cos_poly) / N_cos_poly) ** 2 - 3 * ((1:N_cos_poly) / N_cos_poly) + 10, frequency = 12)
ts_cos_poly <- ts_cos_12 + ts_poli_2
plot(ts_cos_poly)
s_cos_poly <- ssa(ts_cos_poly, L = (N_cos_poly %/% 2) %/% 12 * 12, kind = "1d-ssa", svd.method = "svd")
plot(s_cos_poly, type = "vectors", idx = 1:10)
r_cos_poly <- reconstruct(s_cos_poly,
groups = list(Trend = c(1, 4, 5), Seasonality = 2:3))
s_cos_poly_i <- iossa(s_cos_poly, nested.groups = list(c(1, 4, 5), c(2, 3)))
plot(s_cos_poly_i, type = "vectors", idx = 1:10)
## Warning in .contribution(x, idx, ...): Elementary matrices are not F-orthogonal
## (max F-cor is -0.0102). Contributions can be irrelevant
r_cos_poly_i <- reconstruct(s_cos_poly_i,
groups = list(Trend = 1:3, Seasonality = 4:5))
plot(r_cos_poly$Trend)
lines(r_cos_poly_i$Trend, col = "red")
Применим EOSSA к ряду косинуса плюс полином второй степени:
plot(ts_cos_poly)
s_cos_poly <- ssa(ts_cos_poly, L = (N_cos_poly %/% 2) %/% 12 * 12, kind = "1d-ssa", svd.method = "svd")
plot(s_cos_poly, type = "vectors", idx = 1:10)
r_cos_poly <- reconstruct(s_cos_poly,
groups = list(Trend = c(1, 4, 5), Seasonality = 2:3))
s_cos_poly_e <- eossa(s_cos_poly, nested.groups = list(c(1, 4, 5), c(2, 3)))
plot(s_cos_poly_e, type = "vectors", idx = 1:10)
## Warning in .contribution(x, idx, ...): Elementary matrices are not F-orthogonal
## (max F-cor is 0.0178). Contributions can be irrelevant
r_cos_poly_e <- reconstruct(s_cos_poly_e,
groups = list(Trend = 1:3, Seasonality = 4:5))
plot(ts_poli_2)
lines(r_cos_poly$Trend, col = "red", lty = 2, lwd = 2)
lines(r_cos_poly_e$Trend, col = "blue", lty = 3, lwd = 2)
legend(x = "topleft", c("original poli", "SSA trend", "EOSSA trend"), col = c("black", "red", "blue"), lty = c(1, 2, 3), lwd = c(1, 2, 2))
plot(ts_cos_12)
lines(r_cos_poly$Seasonality, col = "red", lty = 2, lwd = 2)
lines(r_cos_poly_e$Seasonality, col = "blue", lty = 3, lwd = 2)
legend(x = "topleft", c("original cos", "SSA seasonality", "EOSSA seasonality"), col = c("black", "red", "blue"), lty = c(1, 2, 3), lwd = c(1, 2, 2))
Видим, что мы действительно разделили тренд и сезонность.
Попробуем применить eossa к реальным данным по электричеству:
plot(s_electr, type = "vectors", idx = 1:20)
s_electr_e <- eossa(s_electr, nested.groups = list(c(1, 6, 11:13), c(2:5, 7:10)), k = 6)
plot(s_electr_e, type = "vectors", idx = 1:20)
## Warning in .contribution(x, idx, ...): Elementary matrices are not F-orthogonal
## (max F-cor is 0.00566). Contributions can be irrelevant
r_electr_e <- reconstruct(s_electr_e,
groups = list(Trend = c(1:3), Seasonality = c(4:13)))
plot(ts_electr_2)
lines(r_electr$Trend, col = "red", lty = 2, lwd = 2)
lines(r_electr_e$Trend, col = "blue", lty = 3, lwd = 2)
legend(x = "topleft", c("original", "SSA trend", "EOSSA trend"), col = c("black", "red", "blue"), lty = c(1, 2, 3), lwd = c(1, 2, 2))
plot(r_electr$Seasonality, lty = 1, lwd = 1)
lines(r_electr_e$Seasonality, col = "red", lty = 2, lwd = 2)
legend(x = "topleft", c("SSA seasonality", "EOSSA seasonality"), col = c("black", "red"), lty = c(1, 2), lwd = c(1, 2))
Видим, что в начале тренд выделился лучше.
Проверим новый вариант eossa на реальных данных:
s_electr_e_n <- eossa_new(s_electr, nested.groups = list(c(1, 6, 11:13), c(2:5, 7:10)), clust_type = "distance")
s_electr_e_n$iossa.groups
## $F1
## [1] 8 9
##
## $F2
## [1] 12 13
##
## $F3
## [1] 1 2 3
##
## $F4
## [1] 10 11
##
## $F5
## [1] 4 5 6 7
plot(s_electr_e_n, type = "vectors", idx = 1:20)
## Warning in .contribution(x, idx, ...): Elementary matrices are not F-orthogonal
## (max F-cor is 0.000277). Contributions can be irrelevant
r_electr_e_n <- reconstruct(s_electr_e_n,
groups = list(Trend = c(1:3), Seasonality = c(4:13)))
plot(ts_electr_2)
lines(r_electr$Trend, col = "red", lty = 2, lwd = 2)
lines(r_electr_e$Trend, col = "blue", lty = 3, lwd = 2)
lines(r_electr_e_n$Trend, col = "green", lty = 6, lwd = 2)
legend(x = "topleft", c("original", "SSA trend", "EOSSA trend", "EOSSA new trend"), col = c("black", "red", "blue", "green"), lty = c(1, 2, 3, 6), lwd = c(1, 2, 2, 2))
plot(r_electr$Seasonality, col = "red", lty = 1, lwd = 1)
lines(r_electr_e$Seasonality, col = "blue", lty = 2, lwd = 2)
lines(r_electr_e_n$Seasonality, col = "green", lty = 6, lwd = 2)
legend(x = "topleft", c("SSA seasonality", "EOSSA seasonality", "EOSSA new trend"), col = c("red", "blue", "green"), lty = c(1, 2, 6), lwd = c(1, 2, 2))
То же самое.
Применим декомпозицию ряда: выделив только один период 12 и периоды 10, 12 и 13:
dec_electr <- stats::decompose(ts_electr_2)
plot(dec_electr)
spec.pgram(na.exclude(dec_electr$random), taper = 0, log='no', fast = FALSE)
acf(na.exclude(dec_electr$random))
ts_electr_10 <- ts(as.vector(electricity_data$electr), frequency = 10)
dec_electr_10 <- stats::decompose(ts_electr_10)
plot(dec_electr_10)
spec.pgram(na.exclude(dec_electr_10$random), taper = 0, log='no', fast = FALSE)
acf(na.exclude(dec_electr_10$random))
ts_electr_12 <- ts(dec_electr_10$x - dec_electr_10$seasonal, frequency = 12)
dec_electr_12 <- stats::decompose(ts_electr_12)
plot(dec_electr_12)
spec.pgram(na.exclude(dec_electr_12$random), taper = 0, log='no', fast = FALSE)
acf(na.exclude(dec_electr_12$random))
ts_electr_13 <- ts(dec_electr_12$x - dec_electr_12$seasonal, frequency = 13)
dec_electr_13 <- stats::decompose(ts_electr_13)
plot(dec_electr_13)
spec.pgram(na.exclude(dec_electr_13$random), taper = 0, log='no', fast = FALSE)
acf(na.exclude(dec_electr_13$random))
Тренд для одного периода даже глаже.
Теперь применим loess декомпозицию:
stl_electr <- stl(ts_electr_2, s.window = 7, s.degree = 1)
plot(stl_electr)
spec.pgram(stl_electr$time.series[, 3], taper = 0, log='no', fast = FALSE)
acf(stl_electr$time.series[, 3])
Сравнение трендов декомпозиции, loess и SSA:
plot(as.vector(dec_electr$x), type = "l")
lines(as.vector(r_electr_auto_fossa$trend), type = "l", col = "purple")
lines(as.vector(dec_electr$trend), type = "l", col = "red")
lines(as.vector(stl_electr$time.series[, 2]), type = "l", col = "blue")
legend(x = "bottomright", c("series", "Fossa SSA", "decompose 12", "decompose 10, 12, 13"), lty = 1, lwd = 1, col = c("black", "purple", "red", "blue"))
spec.pgram(as.vector(residuals(r_electr_auto_fossa)), taper = 0, log='no', fast = FALSE)
acf(as.vector(residuals(r_electr_auto_fossa)))
Обрежем наш ряд на 2 периода:
ts_electr_sh <- ts(ts_electr_2[1:(length(ts_electr_2) - 2 * freq_12)], frequency = freq_12)
L_electr_sh <- length(ts_electr_sh) %/% 2 %/% 12 * 12
s_electr_sh <- ssa(ts_electr_sh, kind = "1d-ssa", svd.method = "svd", L = L_electr_sh)
plot(s_electr_sh, type = "vectors", idx = 1:20)
s_electr_sh_e <- eossa(s_electr_sh, nested.groups = list(c(1, 6, 11:13, 20), c(2:5, 7:10)), k = 7)
plot(s_electr_sh_e, type = "vectors", idx = 1:20)
## Warning in .contribution(x, idx, ...): Elementary matrices are not F-orthogonal
## (max F-cor is -0.0659). Contributions can be irrelevant
r_electr_sh_e <- reconstruct(s_electr_sh_e,
groups = list(Trend = c(1:3), Seasonality = c(4:13, 20)))
plot(r_electr_sh_e, add.residuals = TRUE, add.original = TRUE,
plot.method = "xyplot",
superpose = TRUE, auto.key = list(columns = 2))
spec.pgram(as.vector(residuals(r_electr_sh_e)), taper = 0, log='no', fast = FALSE)
И предскажем сначала весь сигнал и сравним с рядом:
forecast.len <- 2 * freq_12
rfor_electr <- rforecast(s_electr_sh_e, groups = list(1:13),
len = forecast.len)
vfor_electr <- vforecast(s_electr_sh_e, groups = list(1:13),
len = forecast.len)
plot(ts_electr_2, type = "l")
lines(ts(c(rep(NA, length(ts_electr_sh)), rfor_electr), frequency = freq_12), type = "l", col = "blue")
lines(ts(c(rep(NA, length(ts_electr_sh)), vfor_electr), frequency = freq_12), type = "l", col = "red")
legend(x = "bottom", c("SSA", "rforecast", "vforecast"), lty = 1, col = c("black", "blue", "red"))
plot(ts(ts_electr_2[(length(ts_electr_2) - 2 * freq_12 + 1):length(ts_electr_2)], frequency = freq_12), type = "l")
lines(ts(rfor_electr, frequency = freq_12), type = "l", col = "blue")
lines(ts(vfor_electr, frequency = freq_12), type = "l", col = "red")
legend(x = "bottom", c("SSA", "rforecast", "vforecast"), lty = 1, col = c("black", "blue", "red"))
Теперь только тренд:
rfor_electr_tr <- rforecast(s_electr_sh_e, groups = list(1:3),
len = forecast.len)
vfor_electr_tr <- vforecast(s_electr_sh_e, groups = list(1:3),
len = forecast.len)
plot(r_electr_e$Trend, type = "l")
lines(ts(c(rep(NA, length(r_electr_sh_e$Trend)), rfor_electr_tr), frequency = freq_12), type = "l", col = "blue")
lines(ts(c(rep(NA, length(r_electr_sh_e$Trend)), vfor_electr_tr), frequency = freq_12), type = "l", col = "red")
legend(x = "bottom", c("SSA", "rforecast", "vforecast"), lty = 1, col = c("black", "blue", "red"))
plot(ts(r_electr_e$Trend[(length(r_electr_e$Trend) - 2 * freq_12 + 1):length(r_electr_e$Trend)], frequency = freq_12), type = "l")
lines(ts(rfor_electr_tr, frequency = freq_12), type = "l", col = "blue")
lines(ts(vfor_electr_tr, frequency = freq_12), type = "l", col = "red")
legend(x = "bottom", c("SSA", "rforecast", "vforecast"), lty = 1, col = c("black", "blue", "red"))
Наконец, период:
rfor_electr_se <- rforecast(s_electr_sh_e, groups = list(4:13),
len = forecast.len)
vfor_electr_se <- vforecast(s_electr_sh_e, groups = list(4:13),
len = forecast.len)
plot(r_electr_e$Seasonality, type = "l")
lines(ts(c(rep(NA, length(r_electr_sh_e$Seasonality)), rfor_electr_se), frequency = freq_12), type = "l", col = "blue")
lines(ts(c(rep(NA, length(r_electr_sh_e$Seasonality)), vfor_electr_se), frequency = freq_12), type = "l", col = "red")
legend(x = "bottom", c("SSA", "rforecast", "vforecast"), lty = 1, col = c("black", "blue", "red"))
plot(ts(r_electr_e$Seasonality[(length(r_electr_e$Seasonality) - 2 * freq_12 + 1):length(r_electr_e$Seasonality)], frequency = freq_12), type = "l")
lines(ts(rfor_electr_se, frequency = freq_12), type = "l", col = "blue")
lines(ts(vfor_electr_se, frequency = freq_12), type = "l", col = "red")
legend(x = "bottom", c("SSA", "rforecast", "vforecast"), lty = 1, col = c("black", "blue", "red"))
Возьмём корни для ЛРФ:
par_electr_sh <- parestimate(s_electr_sh_e, groups = list(c(1:13)), method = "esprit")
par_electr_sh
## period rate | Mod Arg | Re Im
## Inf 0.001272 | 1.00127 0.00 | 1.00127 0.00000
## 5.995 -0.000437 | 0.99956 1.05 | 0.49900 0.86610
## -5.995 -0.000437 | 0.99956 -1.05 | 0.49900 -0.86610
## 12.000 -0.000909 | 0.99909 0.52 | 0.86523 0.49955
## -12.000 -0.000909 | 0.99909 -0.52 | 0.86523 -0.49955
## 4.002 -0.001734 | 0.99827 1.57 | 0.00079 0.99827
## -4.002 -0.001734 | 0.99827 -1.57 | 0.00079 -0.99827
## 2.399 -0.003505 | 0.99650 2.62 | -0.86348 0.49741
## -2.399 -0.003505 | 0.99650 -2.62 | -0.86348 -0.49741
## 2191.237 -0.021492 | 0.97874 0.00 | 0.97873 0.00281
## -2191.237 -0.021492 | 0.97874 -0.00 | 0.97873 -0.00281
## 13.036 -0.027175 | 0.97319 0.48 | 0.86231 0.45113
## -13.036 -0.027175 | 0.97319 -0.48 | 0.86231 -0.45113
Построим формулу и прогноз на 10 периодов:
list_formula_electr_sh <- find_explicit_formula_conj(ts_electr_sh, roots = par_electr_sh$roots)
list_formula_electr_comp_sh <- compress_sincos_to_cos(list_formula_electr_sh)
create_str_formula_conj(list_formula_electr_comp_sh)
## [1] "x _n = (103750.2691) * 1.0013 ^n + (12505.7378) * 0.9996 ^n * cos(2 * pi * n / 5.9948 + (-1.437)) + (13019.0011) * 0.9991 ^n * cos(2 * pi * n / 11.9998 + (2.1302)) + (3969.6153) * 0.9983 ^n * cos(2 * pi * n / 4.002 + (0.3681)) + (4268.9226) * 0.9965 ^n * cos(2 * pi * n / 2.3991 + (-1.6031)) + (309252.6649) * 0.9787 ^n * cos(2 * pi * n / 2191.2371 + (-1.6793)) + (9263.2443) * 0.9732 ^n * cos(2 * pi * n / 13.0355 + (-0.7867))"
ts_electr_sh_formula <- ts(model_series_by_formula_conj(list_formula_electr_comp_sh, length(ts_electr_sh) + 10 * freq_12), frequency = 12)
plot(ts_electr_sh_formula, lty = 3, lwd = 2, col = "green")
lines(r_electr_sh_e$Trend + r_electr_sh_e$Seasonality, lty = 6, lwd = 2, col = "purple")
lines(ts_electr_sh, type = "l", col = "red")
legend(x="bottom", legend = c("original", "ssa", "formula (compressed)"), lty = c(1, 3), col = c("red", "purple", "green"))
Так как у нас только один корень, который соответствует экспоненциальному поведению, то логично, что ряд будет возрастать постепенно с медленным увеличением скорости роста, что и видно на графике.
Построим бутстреп-доверительные интервалы для нашего предсказания (предсказательный интервал):
plot(s_electr_sh, type = "vectors", idx = 1:20)
for_electr <- forecast(s_electr_sh, groups = list(c(1:13)),
method = "vector", h = 2 * freq_12,
interval = "prediction",
level=c(0.8, 0.95))
plot(for_electr)
lines(ts_electr_2,col="black")
plot(for_electr, xlim = c(20, 22))
lines(ts_electr_2,col="black")
s_electr_sh_60 <- ssa(ts_electr_sh, L = 60, svd.method = "svd")
plot(s_electr_sh_60, type = "vectors", idx = 1:20)
for_electr_60 <- forecast(s_electr_sh_60, groups = list(c(1:10)),
method = "vector", h = 2 * freq_12,
interval = "prediction",
level=c(0.8, 0.95))
plot(for_electr_60, xlim = c(20, 22))
lines(ts_electr_2,col="black")
s_electr_sh_48 <- ssa(ts_electr_sh, L = 48, svd.method = "svd")
plot(s_electr_sh_48, type = "vectors", idx = 1:20)
for_electr_48 <- forecast(s_electr_sh_48, groups = list(c(1:8)),
method = "vector", h = 2 * freq_12,
interval = "prediction",
level=c(0.8, 0.95))
plot(for_electr_48, xlim = c(20, 22))
lines(ts_electr_2,col="black")
Видим, что реальные значения ряда полностью попадают даже в \(80\%\) предсказательный интервал.
Теперь доверительный интервал:
for_electr <- forecast(s_electr_sh, groups = list(c(1:13)),
method = "vector", h = 2 * freq_12,
interval = "confidence",
level=c(0.8, 0.95))
plot(for_electr)
lines(ts_electr_2,col="black")
plot(for_electr, xlim = c(20, 22))
lines(ts_electr_2,col="black")
Здесь уже видим, что далеко не все точки попадают в предсказательный интервал.
Сначала воспользуемся стандартным заполнением пропусков на основе разложения и предсказания:
ts_electr_hole <- ts_electr_2
ts_electr_hole[100:150] <- NA
plot(ts_electr_hole)
s_electr_hole <- ssa(ts_electr_hole, L = 60)
g_electr <- gapfill(s_electr_hole, groups = list(c(1, 6)), method = "sequential",
base = "reconstructed")
g0_electr <- gapfill(s_electr_hole, groups = list(c(1, 6)), method = "sequential",
alpha = 0, base = "reconstructed")
g1_electr <- gapfill(s_electr_hole, groups = list(c(1, 6)), method = "sequential",
alpha = 1, base = "reconstructed")
plot(ts_electr_2, col = "black")
lines(g0_electr, col = "blue", lwd = 2)
lines(g1_electr, col = "green", lwd = 2)
lines(g_electr, col = "red", lwd = 2)
g_electr_all <- gapfill(s_electr_hole, groups = list(c(1:13)), method = "sequential",
base = "reconstructed")
g0_electr_all <- gapfill(s_electr_hole, groups = list(c(1:13)), method = "sequential",
alpha = 0, base = "reconstructed")
g1_electr_all <- gapfill(s_electr_hole, groups = list(c(1:13)), method = "sequential",
alpha = 1, base = "reconstructed")
plot(ts_electr_2, col = "black")
lines(g0_electr_all, col = "blue", lwd = 2)
lines(g1_electr_all, col = "green", lwd = 2)
lines(g_electr_all, col = "red", lwd = 2)
Видим, что в месте, где у нас пропущены значения происходит скачок, поэтому восстановление слева выше реальных значений, а восстановление справа — ниже. Хотя в среднем значения достаточно близки к правде.
Рассмотрим итеративное заполнение пропусков:
ig_electr <- igapfill(s_electr_hole, groups = list(c(1, 6)),
base = "reconstructed")
igo_electr <- igapfill(s_electr_hole, groups = list(c(1, 6)),
base = "original")
plot(ts_electr_2, col="black")
lines(ig_electr, col = "blue", lwd = 1)
lines(igo_electr, col = "red", lwd = 1)
ig1_electr <- igapfill(s_electr_hole, groups = list(c(1, 6)),
base = "original", maxiter = 1)
ig5_electr <- igapfill(s_electr_hole, groups = list(c(1, 6)), fill = ig1_electr,
base = "original", maxiter = 4)
ig10_electr <- igapfill(s_electr_hole, groups = list(c(1, 6)), fill = ig5_electr,
base = "original", maxiter = 5)
init.lin <- ts_electr_2
init.lin[100:150] <- ts_electr_2[100] + (0:100) / 100 * (ts_electr_2[150] - ts_electr_2[100])
## Warning in NextMethod("[<-"): number of items to replace is not a multiple of
## replacement length
ig.lin_electr <- igapfill(s_electr_hole,
fill = init.lin,
groups = list(c(1, 6)),
base = "original", maxiter = 10)
# Compare the result
plot(ts_electr_2, col = "black")
lines(ig1_electr, col = "green", lwd = 1)
lines(ig5_electr, col = "blue", lwd = 1)
lines(ig10_electr, col = "red", lwd = 1)
lines(ig.lin_electr, col = "darkred", lwd = 1)
Заполненный тренд становится более выгнутым, подстраиваясь под реальные данные.
Посчитаем среднюю ошибку заполения в центре:
N_csn <- 252
ts_cos_s <- ts(exp(2 * (1:N_csn) / N_csn) + cos(2 * pi * (1:N_csn) / freq_12 + pi / 3), frequency = freq_12)
ts_cos_sn <- ts(ts_cos_s + 0.4 * rnorm(N_csn), frequency = freq_12)
plot(ts_cos_sn)
ts_cos_sn_hm <- ts_cos_sn
ts_cos_sn_hm[108:144] <- NA
lines(ts_cos_sn_hm, col = "red")
s_cos_sn_hm <- ssa(ts_cos_sn_hm, L = 5 * freq_12, svd.method = "svd")
plot(s_cos_sn_hm, type = "vectors", idx = 1:10)
g_cos_sn_hm <- gapfill(s_cos_sn_hm, groups = list(c(1, 2, 3)), method = "sequential",
base = "reconstructed")
ig_cos_sn_hm <- igapfill(s_cos_sn_hm, groups = list(c(1, 2, 3)),
base = "reconstructed")
plot(ts_cos_s, col="black")
lines(g_cos_sn_hm, col = "blue", lwd = 1)
lines(ig_cos_sn_hm, col = "red", lwd = 1)
legend(x = "topleft", c("original trend", "gapfill", "igapfill"), lty = 1, lwd = 1, col = c("black", "blue", "red"))
N_model <- 100
mse_g <- c()
mse_ig <- c()
for (i in 1:N_model) {
ts_cos_s <- ts(exp(2 * (1:N_csn) / N_csn) + cos(2 * pi * (1:N_csn) / freq_12 + pi / 3), frequency = freq_12)
ts_cos_sn <- ts(ts_cos_s + 0.4 * rnorm(N_csn), frequency = freq_12)
ts_cos_sn_hm <- ts_cos_sn
ts_cos_sn_hm[108:144] <- NA
s_cos_sn_hm <- ssa(ts_cos_sn_hm, L = 5 * freq_12, svd.method = "svd")
g_cos_sn_hm <- gapfill(s_cos_sn_hm, groups = list(c(1, 2, 3)), method = "sequential",
base = "reconstructed")
ig_cos_sn_hm <- igapfill(s_cos_sn_hm, groups = list(c(1, 2, 3)),
base = "reconstructed")
mse_g <- c(mse_g, mean((ts_cos_s[108:144] - g_cos_sn_hm[108:144]) ** 2))
mse_ig <- c(mse_ig, mean((ts_cos_s[108:144] - ig_cos_sn_hm[108:144]) ** 2))
}
cat("gapfill ", mean(mse_g), "\n")
## gapfill 0.008221212
cat("iterative gapfill ", mean(mse_ig), "\n")
## iterative gapfill 0.003989773
Итеративный способ лучше:
t.test(mse_g, mse_ig)
##
## Welch Two Sample t-test
##
## data: mse_g and mse_ig
## t = 6.8524, df = 170.4, p-value = 1.271e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.003012474 0.005450403
## sample estimates:
## mean of x mean of y
## 0.008221212 0.003989773
А теперь на краях:
plot(ts_cos_sn)
ts_cos_sn_he <- ts_cos_sn
ts_cos_sn_he[216:252] <- NA
lines(ts_cos_sn_he, col = "red")
s_cos_sn_he <- ssa(ts_cos_sn_he, L = 5 * freq_12, svd.method = "svd")
plot(s_cos_sn_he, type = "vectors", idx = 1:10)
g_cos_sn_he <- gapfill(s_cos_sn_he, groups = list(c(1, 2, 3)), method = "sequential",
base = "reconstructed")
ig_cos_sn_he <- igapfill(s_cos_sn_he, groups = list(c(1, 2, 3)),
base = "reconstructed")
plot(ts_cos_s, col="black")
lines(g_cos_sn_he, col = "blue", lwd = 1)
lines(ig_cos_sn_he, col = "red", lwd = 1)
legend(x = "topleft", c("original trend", "gapfill", "igapfill"), lty = 1, lwd = 1, col = c("black", "blue", "red"))
N_model <- 100
mse_g_e <- c()
mse_ig_e <- c()
for (i in 1:N_model) {
ts_cos_s <- ts(exp(2 * (1:N_csn) / N_csn) + cos(2 * pi * (1:N_csn) / freq_12 + pi / 3), frequency = freq_12)
ts_cos_sn <- ts(ts_cos_s + 0.4 * rnorm(N_csn), frequency = freq_12)
ts_cos_sn_he <- ts_cos_sn
ts_cos_sn_he[216:252] <- NA
s_cos_sn_he <- ssa(ts_cos_sn_he, L = 5 * freq_12, svd.method = "svd")
g_cos_sn_he <- gapfill(s_cos_sn_he, groups = list(c(1, 2, 3)), method = "sequential",
base = "reconstructed")
ig_cos_sn_he <- igapfill(s_cos_sn_he, groups = list(c(1, 2, 3)),
base = "reconstructed")
mse_g_e <- c(mse_g_e, mean((ts_cos_s[216:252] - g_cos_sn_he[216:252]) ** 2))
mse_ig_e <- c(mse_ig_e, mean((ts_cos_s[216:252] - ig_cos_sn_he[216:252]) ** 2))
}
cat("gapfill ", mean(mse_g_e), "\n")
## gapfill 0.01978561
cat("iterative gapfill ", mean(mse_ig_e), "\n")
## iterative gapfill 0.02124055
Лучше обычный gapfill (правда, не значимо):
t.test(mse_g_e, mse_ig_e)
##
## Welch Two Sample t-test
##
## data: mse_g_e and mse_ig_e
## t = -0.56683, df = 193.53, p-value = 0.5715
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.006517441 0.003607571
## sample estimates:
## mean of x mean of y
## 0.01978561 0.02124055
Смоделируем ARIMA ряды с разными параметрами. Для начала чисто авторегрессионный ряд первого порядка:
set.seed(42)
ts_sim_100 <- arima.sim(list(order = c(1,0,0), ar = 0.7), n = 1000)
plot(ts_sim_100)
acf(ts_sim_100, lag.max = 50)
pacf(ts_sim_100)
auto.arima(ts_sim_100)
## Series: ts_sim_100
## ARIMA(1,0,0) with zero mean
##
## Coefficients:
## ar1
## 0.6944
## s.e. 0.0230
##
## sigma^2 = 1.009: log likelihood = -1423.06
## AIC=2850.12 AICc=2850.13 BIC=2859.94
Видим характерное поведение ACF и PACF, а также верная оценка по auto.arima. Теперь модель скользящего среднего первого порядка:
set.seed(42)
ts_sim_001 <- arima.sim(list(order = c(0,0,1), ma = 0.7), n = 1000)
plot(ts_sim_001)
acf(ts_sim_001, lag.max = 100)
pacf(ts_sim_001)
auto.arima(ts_sim_001)
## Series: ts_sim_001
## ARIMA(0,0,1) with zero mean
##
## Coefficients:
## ma1
## 0.7070
## s.e. 0.0223
##
## sigma^2 = 1.009: log likelihood = -1423.13
## AIC=2850.25 AICc=2850.26 BIC=2860.07
Смоделируем что-то посложнее:
set.seed(42)
ts_sim_523 <- arima.sim(list(order = c(5, 2, 3), ar = c(0.28, -0.2, -0.3, 0.1, 0.4), ma = c(0.7, 0.8, -0.3)), n = 1000)
plot(ts_sim_523)
plot(ts_sim_523 |> diff() |> diff())
acf(ts_sim_523 |> diff() |> diff(), lag.max = 100)
pacf(ts_sim_523 |> diff() |> diff())
auto.arima(ts_sim_523)
## Series: ts_sim_523
## ARIMA(5,2,3)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2 ma3
## 0.2380 -0.2210 -0.2826 0.0634 0.3951 0.6843 0.7414 -0.2168
## s.e. 0.0788 0.0332 0.0418 0.0484 0.0314 0.0855 0.0752 0.0818
##
## sigma^2 = 1.172: log likelihood = -1498.6
## AIC=3015.2 AICc=3015.39 BIC=3059.37
Попробуем проанализировать некоторые примеры рядов:
ts4 <- ts(read_csv("arima_model/ts4.csv"))
## Rows: 3001 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): x
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ts8 <- ts(read_csv("arima_model/ts8.csv"))
## Rows: 1000 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): x
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Для данного ряда тест Dickey-Fuller говорит о том, что стационарность есть, однако auto.arima определяет порядок дифференцирования \(1\), поэтому выберем параметр \(d = 1\):
plot(ts4)
adf.test(ts4)
##
## Augmented Dickey-Fuller Test
##
## data: ts4
## Dickey-Fuller = -3.5558, Lag order = 14, p-value = 0.0369
## alternative hypothesis: stationary
plot(ts4 |> diff())
acf(ts4 |> diff(), lag.max = 100)
pacf(ts4 |> diff())
auto.arima(ts4)
## Series: ts4
## ARIMA(1,1,1)
##
## Coefficients:
## ar1 ma1
## 0.0318 0.7992
## s.e. 0.0225 0.0135
##
## sigma^2 = 1.004: log likelihood = -4261.67
## AIC=8529.35 AICc=8529.35 BIC=8547.36
(fit4 <- Arima(ts4, order=c(0,1,1)))
## Series: ts4
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## 0.8100
## s.e. 0.0104
##
## sigma^2 = 1.004: log likelihood = -4262.67
## AIC=8529.34 AICc=8529.35 BIC=8541.35
plot(fit4)
plot(forecast(fit4, h = 50), xlim = c(2900, 3060))
Для ряда ниже получили модель авторегрессии третьего порядка:
plot(ts8)
adf.test(ts8)
## Warning in adf.test(ts8): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: ts8
## Dickey-Fuller = -10.044, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
acf(ts8, lag.max = 100)
pacf(ts8)
auto.arima(ts8)
## Series: ts8
## ARIMA(3,0,0) with zero mean
##
## Coefficients:
## ar1 ar2 ar3
## 0.2085 0.1075 -0.0770
## s.e. 0.0315 0.0321 0.0316
##
## sigma^2 = 1.913: log likelihood = -1741.9
## AIC=3491.81 AICc=3491.85 BIC=3511.44
(fit8 <- Arima(ts8, order=c(3,0,0)))
## Series: ts8
## ARIMA(3,0,0) with non-zero mean
##
## Coefficients:
## ar1 ar2 ar3 mean
## 0.2072 0.1064 -0.0783 -0.0650
## s.e. 0.0315 0.0321 0.0316 0.0571
##
## sigma^2 = 1.913: log likelihood = -1741.26
## AIC=3492.52 AICc=3492.58 BIC=3517.06
plot(fit8)
plot(forecast(fit8, h = 10), xlim = c(990, 1020))
Определим параметры ARIMA для данных с электричеством (без сезонности) и построим предсказание с доверительными интервалами:
ts_electr_sh_tr <- ts(ts_electr_sh - r_electr_sh_e$Seasonality)
plot(ts_electr_sh_tr)
acf(ts_electr_sh_tr |> diff())
pacf(ts_electr_sh_tr |> diff())
auto.arima(ts_electr_sh_tr, stepwise = FALSE, trace = TRUE)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(0,1,0) : 4431.404
## ARIMA(0,1,0) with drift : 4432.508
## ARIMA(0,1,1) : 4384.042
## ARIMA(0,1,1) with drift : 4380.795
## ARIMA(0,1,2) : 4386.096
## ARIMA(0,1,2) with drift : 4382.652
## ARIMA(0,1,3) : 4387.473
## ARIMA(0,1,3) with drift : 4382.999
## ARIMA(0,1,4) : 4389.478
## ARIMA(0,1,4) with drift : 4385.062
## ARIMA(0,1,5) : 4390.834
## ARIMA(0,1,5) with drift : 4385.707
## ARIMA(1,1,0) : 4393.07
## ARIMA(1,1,0) with drift : 4392.754
## ARIMA(1,1,1) : 4387.176
## ARIMA(1,1,1) with drift : 4383.606
## ARIMA(1,1,2) : Inf
## ARIMA(1,1,2) with drift : 4384.993
## ARIMA(1,1,3) : Inf
## ARIMA(1,1,3) with drift : Inf
## ARIMA(1,1,4) : Inf
## ARIMA(1,1,4) with drift : Inf
## ARIMA(2,1,0) : 4391.875
## ARIMA(2,1,0) with drift : 4390.799
## ARIMA(2,1,1) : 4389.204
## ARIMA(2,1,1) with drift : 4384.253
## ARIMA(2,1,2) : Inf
## ARIMA(2,1,2) with drift : Inf
## ARIMA(2,1,3) : Inf
## ARIMA(2,1,3) with drift : Inf
## ARIMA(3,1,0) : 4389.396
## ARIMA(3,1,0) with drift : 4387.276
## ARIMA(3,1,1) : 4391.555
## ARIMA(3,1,1) with drift : 4386.813
## ARIMA(3,1,2) : 4381.093
## ARIMA(3,1,2) with drift : Inf
## ARIMA(4,1,0) : 4392.356
## ARIMA(4,1,0) with drift : 4390.384
## ARIMA(4,1,1) : Inf
## ARIMA(4,1,1) with drift : 4389.07
## ARIMA(5,1,0) : 4389.293
## ARIMA(5,1,0) with drift : 4385.331
##
## Now re-fitting the best model(s) without approximations...
##
##
##
##
## Best model: ARIMA(0,1,1) with drift
## Series: ts_electr_sh_tr
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## -0.5323 288.2537
## s.e. 0.0621 119.8871
##
## sigma^2 = 14894480: log likelihood = -2195.88
## AIC=4397.77 AICc=4397.88 BIC=4408.04
(fit_elect_arima_tr <- Arima(ts_electr_sh_tr, order=c(0,1,1), include.drift = TRUE))
## Series: ts_electr_sh_tr
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## -0.5323 288.2537
## s.e. 0.0621 119.8871
##
## sigma^2 = 14894480: log likelihood = -2195.88
## AIC=4397.77 AICc=4397.88 BIC=4408.04
plot(forecast(fit_elect_arima_tr, h = 2 * freq_12))
lines(ts(ts_electr_2 - r_electr_e$Seasonality))
plot(forecast(fit_elect_arima_tr, h = 2 * freq_12), xlim = c(200, 260))
lines(ts(ts_electr_2 - r_electr_e$Seasonality))
Добавим сезонность и применим Seasonal ARIMA:
plot(ts_electr_sh)
auto.arima(ts_electr_sh, stepwise = FALSE, trace = TRUE)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(0,1,0)(0,1,0)[12] : 4126.288
## ARIMA(0,1,0)(0,1,1)[12] : 4038.829
## ARIMA(0,1,0)(0,1,2)[12] : 4040.837
## ARIMA(0,1,0)(1,1,0)[12] : 4078.156
## ARIMA(0,1,0)(1,1,1)[12] : 4038.75
## ARIMA(0,1,0)(1,1,2)[12] : 4039.319
## ARIMA(0,1,0)(2,1,0)[12] : 4061.196
## ARIMA(0,1,0)(2,1,1)[12] : 4043.188
## ARIMA(0,1,0)(2,1,2)[12] : 4037.291
## ARIMA(0,1,1)(0,1,0)[12] : 4098.766
## ARIMA(0,1,1)(0,1,1)[12] : 4022.407
## ARIMA(0,1,1)(0,1,2)[12] : 4024.483
## ARIMA(0,1,1)(1,1,0)[12] : 4056.01
## ARIMA(0,1,1)(1,1,1)[12] : 4020.022
## ARIMA(0,1,1)(1,1,2)[12] : 4021.072
## ARIMA(0,1,1)(2,1,0)[12] : 4039.773
## ARIMA(0,1,1)(2,1,1)[12] : 4020.581
## ARIMA(0,1,1)(2,1,2)[12] : 4013.566
## ARIMA(0,1,2)(0,1,0)[12] : 4095.577
## ARIMA(0,1,2)(0,1,1)[12] : 4019.584
## ARIMA(0,1,2)(0,1,2)[12] : 4021.655
## ARIMA(0,1,2)(1,1,0)[12] : 4051.194
## ARIMA(0,1,2)(1,1,1)[12] : 4020.562
## ARIMA(0,1,2)(1,1,2)[12] : 4020.725
## ARIMA(0,1,2)(2,1,0)[12] : 4035.716
## ARIMA(0,1,2)(2,1,1)[12] : 4016.642
## ARIMA(0,1,3)(0,1,0)[12] : 4097.653
## ARIMA(0,1,3)(0,1,1)[12] : 4017.144
## ARIMA(0,1,3)(0,1,2)[12] : 4019.229
## ARIMA(0,1,3)(1,1,0)[12] : 4052.301
## ARIMA(0,1,3)(1,1,1)[12] : 4021.231
## ARIMA(0,1,3)(2,1,0)[12] : 4030.255
## ARIMA(0,1,4)(0,1,0)[12] : 4091.831
## ARIMA(0,1,4)(0,1,1)[12] : 4015.122
## ARIMA(0,1,4)(1,1,0)[12] : 4045.984
## ARIMA(0,1,5)(0,1,0)[12] : 4088.909
## ARIMA(1,1,0)(0,1,0)[12] : 4111.161
## ARIMA(1,1,0)(0,1,1)[12] : 4028.755
## ARIMA(1,1,0)(0,1,2)[12] : 4030.765
## ARIMA(1,1,0)(1,1,0)[12] : 4064.662
## ARIMA(1,1,0)(1,1,1)[12] : 4022.238
## ARIMA(1,1,0)(1,1,2)[12] : 4023.962
## ARIMA(1,1,0)(2,1,0)[12] : 4045.793
## ARIMA(1,1,0)(2,1,1)[12] : 4029.713
## ARIMA(1,1,0)(2,1,2)[12] : 4022.232
## ARIMA(1,1,1)(0,1,0)[12] : 4088.133
## ARIMA(1,1,1)(0,1,1)[12] : 4014.248
## ARIMA(1,1,1)(0,1,2)[12] : 4016.285
## ARIMA(1,1,1)(1,1,0)[12] : 4040.518
## ARIMA(1,1,1)(1,1,1)[12] : 4005.488
## ARIMA(1,1,1)(1,1,2)[12] : Inf
## ARIMA(1,1,1)(2,1,0)[12] : 4029.036
## ARIMA(1,1,1)(2,1,1)[12] : 4013.708
## ARIMA(1,1,2)(0,1,0)[12] : 4088.527
## ARIMA(1,1,2)(0,1,1)[12] : 4016.222
## ARIMA(1,1,2)(0,1,2)[12] : 4018.276
## ARIMA(1,1,2)(1,1,0)[12] : 4039.824
## ARIMA(1,1,2)(1,1,1)[12] : 3999.181
## ARIMA(1,1,2)(2,1,0)[12] : 4031.1
## ARIMA(1,1,3)(0,1,0)[12] : Inf
## ARIMA(1,1,3)(0,1,1)[12] : 4017.968
## ARIMA(1,1,3)(1,1,0)[12] : Inf
## ARIMA(1,1,4)(0,1,0)[12] : 4088.167
## ARIMA(2,1,0)(0,1,0)[12] : 4099.598
## ARIMA(2,1,0)(0,1,1)[12] : 4028.851
## ARIMA(2,1,0)(0,1,2)[12] : 4030.944
## ARIMA(2,1,0)(1,1,0)[12] : 4056.21
## ARIMA(2,1,0)(1,1,1)[12] : 4016.558
## ARIMA(2,1,0)(1,1,2)[12] : 4018.461
## ARIMA(2,1,0)(2,1,0)[12] : 4044.096
## ARIMA(2,1,0)(2,1,1)[12] : 4028.23
## ARIMA(2,1,1)(0,1,0)[12] : 4090.718
## ARIMA(2,1,1)(0,1,1)[12] : 4018.898
## ARIMA(2,1,1)(0,1,2)[12] : 4020.954
## ARIMA(2,1,1)(1,1,0)[12] : 4041.251
## ARIMA(2,1,1)(1,1,1)[12] : 4002.829
## ARIMA(2,1,1)(2,1,0)[12] : 4026.985
## ARIMA(2,1,2)(0,1,0)[12] : Inf
## ARIMA(2,1,2)(0,1,1)[12] : 4019.774
## ARIMA(2,1,2)(1,1,0)[12] : Inf
## ARIMA(2,1,3)(0,1,0)[12] : 4089.87
## ARIMA(3,1,0)(0,1,0)[12] : 4102.033
## ARIMA(3,1,0)(0,1,1)[12] : 4031.521
## ARIMA(3,1,0)(0,1,2)[12] : 4033.634
## ARIMA(3,1,0)(1,1,0)[12] : 4059.055
## ARIMA(3,1,0)(1,1,1)[12] : 4014.626
## ARIMA(3,1,0)(2,1,0)[12] : 4046.664
## ARIMA(3,1,1)(0,1,0)[12] : 4092.144
## ARIMA(3,1,1)(0,1,1)[12] : 4022.652
## ARIMA(3,1,1)(1,1,0)[12] : 4042.467
## ARIMA(3,1,2)(0,1,0)[12] : 4093.228
## ARIMA(4,1,0)(0,1,0)[12] : 4104.442
## ARIMA(4,1,0)(0,1,1)[12] : 4032.715
## ARIMA(4,1,0)(1,1,0)[12] : 4058.912
## ARIMA(4,1,1)(0,1,0)[12] : 4090.616
## ARIMA(5,1,0)(0,1,0)[12] : 4104.773
##
## Now re-fitting the best model(s) without approximations...
##
##
##
##
## Best model: ARIMA(1,1,2)(1,1,1)[12]
## Series: ts_electr_sh
## ARIMA(1,1,2)(1,1,1)[12]
##
## Coefficients:
## ar1 ma1 ma2 sar1 sma1
## 0.6347 -1.0023 0.1123 0.0636 -0.8325
## s.e. 0.1396 0.1574 0.1135 0.1000 0.0933
##
## sigma^2 = 19234185: log likelihood = -2112.38
## AIC=4236.76 AICc=4237.17 BIC=4256.99
(fit_elect_arima <- Arima(ts_electr_sh, order=c(1,1,2), seasonal = c(1,1,1)))
## Series: ts_electr_sh
## ARIMA(1,1,2)(1,1,1)[12]
##
## Coefficients:
## ar1 ma1 ma2 sar1 sma1
## 0.6347 -1.0023 0.1123 0.0636 -0.8325
## s.e. 0.1396 0.1574 0.1135 0.1000 0.0933
##
## sigma^2 = 19234185: log likelihood = -2112.38
## AIC=4236.76 AICc=4237.17 BIC=4256.99
f_elect_arima <- forecast(fit_elect_arima, h = 2 * freq_12)
plot(f_elect_arima)
lines(ts_electr_2)
plot(f_elect_arima, xlim = c(20, 22))
lines(ts_electr_2)
Вычислим ошибки предсказания для SSA и ARIMA:
f_part_electr <- ts_electr_2[(length(ts_electr_2) - 2 * freq_12 + 1):length(ts_electr_2)]
cat("SSA реккурентное предсказание: ", mean((rfor_electr - f_part_electr) ** 2), "\n")
## SSA реккурентное предсказание: 11806850
cat("SSA векторное предсказание: ", mean((vfor_electr - f_part_electr) ** 2), "\n")
## SSA векторное предсказание: 11864546
cat("Seasonal ARIMA предсказание", mean((f_elect_arima$mean - f_part_electr) ** 2), "\n")
## Seasonal ARIMA предсказание 21289575
Сгенерируем ряд с временной разладкой:
N_temp <- 252
ts_temp_break <- ts(c(1:(N_temp %/% 2), N_temp %/% 2 - 1:(N_temp %/% 2)) + rnorm(N_temp))
plot(ts_temp_break)
Построим матрицы неоднородности для разных длин окон:
rank_temp <- 1
M_temp <- 48; L_temp <- M_temp / 2
hm48_temp <- hmatr(ts_temp_break, B = M_temp, T = M_temp, L = L_temp, neig = rank_temp)
plot(hm48_temp)
Наконец, нарисуем функции разладки для разных методов перемещения окон:
path_hor_temp <- c()
path_diag_temp <- c()
path_sdiag_temp <- c()
for (i in 1:ncol(hm48_temp)) {
path_hor_temp <- c(path_hor_temp, hm48_temp[i, 20])
path_diag_temp <- c(path_diag_temp, hm48_temp[i, i])
if (i + 60 < ncol(hm48_temp))
path_sdiag_temp <- c(path_sdiag_temp, hm48_temp[i + 60, i])
}
par(mfrow = c(4, 1))
plot(path_hor_temp, type = "l", main = "Horizontal")
plot(path_diag_temp, type = "l", main = "Diagonal")
plot(c(rep(NA, 30), path_sdiag_temp, rep(NA, 30)), type = "l", main = "Conjugated")
plot(ts_temp_break, main = "Series")
Сгенерируем ряд со следующей разладкой: сначала сумма косинусов с периодами \(7\) и \(12\), а потом остаётся только косинус с периодом \(12\):
N_cos_break <- 252
ts_cos_break <- ts(c(1 / 2 * cos(2 * pi * ((N_cos_break %/% 2 + 1):N_cos_break) / 12) + 1 / 2 * cos(2 * pi * ((N_cos_break %/% 2 + 1):N_cos_break) / 7), cos(2 * pi * (1:(N_cos_break %/% 2)) / 12)))
plot(ts_cos_break)
Построим матрицы неоднородности для разных длин окон:
rank_cos_break <- 4
M_cos_break <- 48; L_cos_break <- M_cos_break / 2
hm48_cos_break <- hmatr(ts_cos_break, B = M_cos_break, T = M_cos_break, L = L_cos_break, neig = rank_cos_break)
plot(hm48_cos_break)
Наконец, нарисуем функции разладки для разных методов перемещения окон:
path_hor_cos_break <- c()
path_diag_cos_break <- c()
path_sdiag_cos_break <- c()
for (i in 1:ncol(hm48_cos_break)) {
path_hor_cos_break <- c(path_hor_cos_break, hm48_cos_break[i, 20])
path_diag_cos_break <- c(path_diag_cos_break, hm48_cos_break[i, i])
if (i + 60 < ncol(hm48_cos_break))
path_sdiag_cos_break <- c(path_sdiag_cos_break, hm48_cos_break[i + 60, i])
}
par(mfrow = c(4, 1))
plot(path_hor_cos_break, type = "l", main = "Horizontal")
plot(path_diag_cos_break, type = "l", main = "Diagonal")
plot(c(rep(NA, 30), path_sdiag_cos_break, rep(NA, 30)), type = "l", main = "Conjugated")
plot(ts_cos_break, main = "Series")
Применим матрицу неоднородности к реальному ряду:
rank_electr <- 10
M_electr <- 48; L_electr <- M_electr / 2
hm48_electr <- hmatr(ts_electr_2, B = M_electr, T = M_electr, L = L_electr, neig = rank_electr)
plot(hm48_electr)
Нарисуем функции разладки для разных методов перемещения окон:
path_hor_electr <- c()
path_diag_electr <- c()
path_sdiag_electr <- c()
for (i in 1:ncol(hm48_electr)) {
path_hor_electr <- c(path_hor_electr, hm48_electr[i, 20])
path_diag_electr <- c(path_diag_electr, hm48_electr[i, i])
if (i + 60 < ncol(hm48_electr))
path_sdiag_electr <- c(path_sdiag_electr, hm48_electr[i + 60, i])
}
par(mfrow = c(4, 1))
plot(path_hor_electr, type = "l", main = "Horizontal")
plot(path_diag_electr, type = "l", main = "Diagonal")
plot(c(rep(NA, 30), path_sdiag_electr, rep(NA, 30)), type = "l", main = "Conjugated")
plot(ts_electr_2, main = "Series")